df <- readRDS("swire_no_nas.rds") #inject the data and we will sub-sample
regions_joinme <- read.csv("states_summary.csv")
unique(regions_joinme$REGION)
## [1] "NORTHERN" "DESERT_SW" "PRAIRIE" "CALI_NEVADA" "MOUNTAIN"
## [6] "SOCAL" "ARIZONA" "NEWMEXICO" "NOCAL" "COLORADO"
## [11] "KANSAS"
# "NORTHERN" "DESERT_SW" "PRAIRIE" "CALI_NEVADA" "MOUNTAIN" "SOCAL" "ARIZONA" "NEWMEXICO" "NOCAL" "COLORADO" "KANSAS"
str(regions_joinme)
## 'data.frame': 200 obs. of 2 variables:
## $ MARKET_KEY: int 13 70 179 197 272 352 32 33 44 50 ...
## $ REGION : chr "NORTHERN" "NORTHERN" "DESERT_SW" "DESERT_SW" ...
# Perform a left join using the merge() function
df <- merge(df, regions_joinme[, c("MARKET_KEY", "REGION")], by = "MARKET_KEY", all.x = TRUE)
rm(regions_joinme)
# Update CALORIC_SEGMENT values: 0 if 'DIET/LIGHT', otherwise 1
df$CALORIC_SEGMENT <- ifelse(df$CALORIC_SEGMENT == "DIET/LIGHT", 0, 1)
df$MARKET_KEY <- as.character(df$MARKET_KEY)
df <- df %>%
mutate(
MONTH = as.numeric(substr(DATE, 6, 7)), # Extract the month from YYYY-MM-DD format
SEASON = case_when(
MONTH %in% c(12, 01, 02) ~ "WINTER",
MONTH %in% c(03, 04, 05) ~ "SPRING",
MONTH %in% c(06, 07, 08) ~ "SUMMER",
MONTH %in% c(09, 10, 11) ~ "FALL",
TRUE ~ NA_character_ # This is just in case there are any undefined values
)
)
str(df)
## 'data.frame': 24461424 obs. of 13 variables:
## $ MARKET_KEY : chr "1" "1" "1" "1" ...
## $ DATE : chr "2021-10-16" "2022-06-04" "2022-02-05" "2022-10-08" ...
## $ CALORIC_SEGMENT: num 0 0 1 0 0 1 0 0 1 0 ...
## $ CATEGORY : chr "ENERGY" "SSD" "SSD" "SSD" ...
## $ UNIT_SALES : num 434 28 42 1 26 161 6 5 68 90 ...
## $ DOLLAR_SALES : num 924.04 147.77 25.13 0.99 94.56 ...
## $ MANUFACTURER : chr "PONYS" "SWIRE-CC" "COCOS" "JOLLYS" ...
## $ BRAND : chr "MYTHICAL BEVERAGE ULTRA" "DIET PEPPY CF" "HANSENIZZLE'S ECO" "DIET PAPI" ...
## $ PACKAGE : chr "16SMALL MULTI CUP" "12SMALL 12ONE CUP" "12SMALL 6ONE CUP" "12SMALL 6ONE CUP" ...
## $ ITEM : chr "MYTHICAL BEVERAGE ULTRA SUNRISE ENERGY DRINK UNFLAVORED ZERO SUGAR CUP 16 LIQUID SMALL" "DIET PEPPY CAFFEINE FREE GENTLE DRINK RED PEPPER COLA DIET CUP 12 LIQUID SMALL X12" "HANSENIZZLE'S ECO GENTLE DRINK MANDARIN DURIAN CUP 12 LIQUID SMALL" "DIET PAPI GENTLE DRINK COLA DIET CUP 12 LIQUID SMALL" ...
## $ REGION : chr "NORTHERN" "NORTHERN" "NORTHERN" "NORTHERN" ...
## $ MONTH : num 10 6 2 10 7 9 9 6 10 5 ...
## $ SEASON : chr "FALL" "SUMMER" "WINTER" "FALL" ...
# Assuming df is your dataframe
set.seed(123) # Set a random seed for reproducibility
sampled_df <- df[sample(1:nrow(df), 2446143), ]
rm(df)
df <- sampled_df
rm(sampled_df)
#skim(df)
summary(df)
## MARKET_KEY DATE CALORIC_SEGMENT CATEGORY
## Length:2446143 Length:2446143 Min. :0.0000 Length:2446143
## Class :character Class :character 1st Qu.:0.0000 Class :character
## Mode :character Mode :character Median :1.0000 Mode :character
## Mean :0.5025
## 3rd Qu.:1.0000
## Max. :1.0000
## UNIT_SALES DOLLAR_SALES MANUFACTURER BRAND
## Min. : 0.04 Min. : 0.0 Length:2446143 Length:2446143
## 1st Qu.: 11.00 1st Qu.: 36.5 Class :character Class :character
## Median : 40.00 Median : 135.1 Mode :character Mode :character
## Mean : 173.43 Mean : 587.4
## 3rd Qu.: 126.00 3rd Qu.: 427.4
## Max. :91778.00 Max. :409159.3
## PACKAGE ITEM REGION MONTH
## Length:2446143 Length:2446143 Length:2446143 Min. : 1.000
## Class :character Class :character Class :character 1st Qu.: 3.000
## Mode :character Mode :character Mode :character Median : 6.000
## Mean : 6.283
## 3rd Qu.: 9.000
## Max. :12.000
## SEASON
## Length:2446143
## Class :character
## Mode :character
##
##
##
# Perform a linear regression with UNIT_SALES as the dependent variable
# and PRICE (or your chosen variable) as the independent variable
linear_model <- lm(DOLLAR_SALES ~ UNIT_SALES, data = df)
# Print the summary of the linear model to see the results
summary(linear_model)
##
## Call:
## lm(formula = DOLLAR_SALES ~ UNIT_SALES, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -140089 -117 -68 -3 225329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69.056096 1.023439 67.47 <2e-16 ***
## UNIT_SALES 2.989060 0.001201 2489.17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1567 on 2446141 degrees of freedom
## Multiple R-squared: 0.717, Adjusted R-squared: 0.717
## F-statistic: 6.196e+06 on 1 and 2446141 DF, p-value: < 2.2e-16
# Create a scatter plot with the regression line, colored by MANUFACTURER
ggplot(df, aes(x = UNIT_SALES, y = DOLLAR_SALES, color = MANUFACTURER)) +
geom_point(alpha = 0.5) + # Adjust alpha to avoid overplotting, if necessary
geom_smooth(method = "lm", color = "black", se = FALSE) + # Add linear regression line without confidence band for clarity
labs(title = "Linear Model of UNIT_SALES vs. DOLLAR_SALES by MANUFACTURER",
x = "UNIT SALES",
y = "DOLLAR SALES") +
theme_minimal() +
theme(legend.position = "bottom") # Adjust legend position if needed
## `geom_smooth()` using formula = 'y ~ x'
# create a table of total values by brand
brand_summary <- df %>%
group_by(BRAND) %>%
summarise(
total_units_sold = sum(UNIT_SALES),
total_revenue = sum(DOLLAR_SALES),
avg_price = total_revenue / total_units_sold,
total_days_sold = n() # Count the number of rows for each brand
) %>%
arrange(desc(total_revenue)) %>% # Order by revenue in descending order
mutate(rank = row_number())
summary(brand_summary)
## BRAND total_units_sold total_revenue avg_price
## Length:288 Min. : 1 Min. : 1 Min. : 0.5315
## Class :character 1st Qu.: 2310 1st Qu.: 7563 1st Qu.: 2.0861
## Mode :character Median : 94691 Median : 266075 Median : 3.0291
## Mean : 1473003 Mean : 4989427 Mean : 3.2661
## 3rd Qu.: 651385 3rd Qu.: 2161764 3rd Qu.: 3.7252
## Max. :40414038 Max. :159387186 Max. :42.9378
## total_days_sold rank
## Min. : 1.0 Min. : 1.00
## 1st Qu.: 121.8 1st Qu.: 72.75
## Median : 1988.0 Median :144.50
## Mean : 8493.5 Mean :144.50
## 3rd Qu.: 8075.8 3rd Qu.:216.25
## Max. :124603.0 Max. :288.00
print(brand_summary[brand_summary$BRAND == "DIET SMASH", ])
## # A tibble: 1 × 6
## BRAND total_units_sold total_revenue avg_price total_days_sold rank
## <chr> <dbl> <dbl> <dbl> <int> <int>
## 1 DIET SMASH 50496. 210377. 4.17 1763 150
Out of 288 brands, DIET SMASH slides in at 150th place in terms of total revenue at an above average price $4.17 vs overall $3.27.
# Filter the dataframe for only 'DIET SMASH'
filtered_df <- df %>%
filter(BRAND == "DIET SMASH")
# Create the plot
ggplot(filtered_df, aes(x = UNIT_SALES, y = DOLLAR_SALES)) +
geom_point(color = "red", alpha = 1) + # Bright red points with full opacity
geom_smooth(method = "lm", color = "black", se = FALSE) + # Add linear regression line without confidence band
labs(title = "Linear Model of UNIT_SALES vs. DOLLAR_SALES for DIET SMASH",
x = "UNIT SALES",
y = "DOLLAR SALES") +
theme_minimal() +
theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'
DIET SMASH is not a big seller at base line (our sample only contains abourt 1800 observations). There are 2 distinct trend lines, a high flier group and a low flier group. The high flier group follows the trend line better, while staying mostly above it. The low flier group underperforms the trend line signficantly. The high flier group begins to take an outlier distribution abover 100 unit sales, where dollar sales begin to rapidly outpace the trend line. The low flier group has a much less steep slope, and remains farily tight to about 125 unit sales, but as soon as things hit 150 the dollar sales begin to rise almost verically to meet their peers in the high flier group. Once DIET SMASH hits 150 unit sales or so the dollars start to roll in.
filtered_df %>%
mutate(DATE = as.Date(DATE)) %>%
mutate(WEEK = as.integer(format(DATE, "%U"))) %>%
group_by(WEEK) %>%
summarise(total_sales = sum(UNIT_SALES)) %>%
ggplot(aes(x = WEEK, y = total_sales)) +
geom_line(color = "black") + # Blue line connecting points
labs(title = "Total Sales by Week of the Year",
x = "Week of the Year",
y = "Total Unit Sales") +
theme_minimal()
There are 4 or 5 signficant peaks in sales by week of Diet Smash with a fairly strong clustering between 20 - 35 weeks.
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Calculate total sales for each group of 13 consecutive weeks
sales_by_group <- filtered_df %>%
mutate(DATE = as.Date(DATE)) %>%
mutate(WEEK = as.integer(format(DATE, "%U"))) %>%
group_by(WEEK) %>%
summarise(total_sales = sum(UNIT_SALES)) %>%
mutate(sales_in_group = rollsum(total_sales, 13, align = "left", fill = NA)) %>%
mutate(week_label = paste0("Week ", WEEK + 1, " to Week ", WEEK + 13)) %>%
arrange(WEEK) %>% # Order by WEEK
filter(!is.na(sales_in_group)) # Remove rows with sales_in_group = NA
# Plot the bar chart
sales_by_group$week_label <- factor(sales_by_group$week_label, levels = sales_by_group$week_label[order(sales_by_group$WEEK)])
ggplot(sales_by_group, aes(x = factor(week_label), y = sales_in_group)) +
geom_bar(stat = "identity", fill = "black") +
labs(title = "Total Sales for Each 13-Week Grouping",
x = "Weeks (Starting from Week 1)",
y = "Total Sales") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
> DIET SMASH has a strong prounced ride up around week 8 and 35, with
peak 13 week between week 22 to 34.
#find the best 13 weeks for Kiwano sales
# Calculate total sales for each group of 13 consecutive weeks
sales_by_plum <- df %>%
filter(str_detect(ITEM, "PLUM")) %>%
mutate(DATE = as.Date(DATE)) %>%
mutate(WEEK = as.integer(format(DATE, "%U"))) %>%
group_by(WEEK) %>%
summarise(total_sales = sum(UNIT_SALES)) %>%
mutate(sales_in_group = rollsum(total_sales, 13, align = "left", fill = NA)) %>%
mutate(week_label = paste0("Week ", WEEK + 1, " to Week ", WEEK + 13)) %>%
arrange(WEEK) %>% # Order by WEEK
filter(!is.na(sales_in_group)) # Remove rows with sales_in_group = NA
# Plot the bar chart
sales_by_plum$week_label <- factor(sales_by_plum$week_label, levels = sales_by_plum$week_label[order(sales_by_plum$WEEK)])
ggplot(sales_by_plum, aes(x = factor(week_label), y = sales_in_group)) +
geom_bar(stat = "identity", fill = "black") +
labs(title = "Total Sales for Each 13-Week Grouping",
x = "Weeks (Starting from Week 1)",
y = "Total Sales") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Plum generally trends upward with the strongest sales after week 30, with week 40 to week 52 being the peak 13 week period.
#find the best 13 weeks for plum, ssd, diet, small or one package sales
# Calculate total sales for each group of 13 consecutive weeks
#PLUM flavor does not come in DIET so we will assume that the DIET sales are the same as the REGULAR sales
sales_by_innovation <- df %>%
filter(CATEGORY == "SSD",
str_detect(ITEM, "PLUM"),
str_detect(PACKAGE, "12"),
str_detect(PACKAGE, 'ONE')) %>%
mutate(DATE = as.Date(DATE)) %>%
mutate(WEEK = as.integer(format(DATE, "%U"))) %>%
group_by(WEEK) %>%
summarise(total_sales = sum(UNIT_SALES)) %>%
mutate(sales_in_group = rollsum(total_sales, 13, align = "left", fill = NA)) %>%
mutate(week_label = paste0("Week ", WEEK + 1, " to Week ", WEEK + 13)) %>%
arrange(WEEK) %>% # Order by WEEK
filter(!is.na(sales_in_group)) # Remove rows with sales_in_group = NA
# Plot the bar chart
sales_by_innovation$week_label <- factor(sales_by_innovation$week_label, levels = sales_by_innovation$week_label[order(sales_by_innovation$WEEK)])
ggplot(sales_by_innovation, aes(x = factor(week_label), y = sales_in_group)) +
geom_bar(stat = "identity", fill = "black") +
labs(title = "Total Sales for Each 13-Week Grouping",
x = "Weeks (Starting from Week 1)",
y = "Total Sales") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
With Package, Plum, and SSD we sales increaing slighltyly through the year with week 32 though 45 being the best.
#create innovation based on SSD, Plum, and packages that come in 12 (nearest number to 11) and ONE
innovation<- df %>%
filter(CATEGORY == "SSD",
str_detect(ITEM, "PLUM"),
str_detect(PACKAGE, '12'),
str_detect(PACKAGE, 'ONE'))
#unique PACKAGE string from innovation
print(unique(innovation$PACKAGE))
## [1] "12SMALL 12ONE CUP" "12SMALL 18ONE CUP"
## [3] "12SMALL 24ONE CUP" "12SMALL 6ONE CUP"
## [5] "12SMALL 20ONE CUP" "12SMALL 24ONE PLASTICS JUG"
library(dplyr)
library(lubridate)
innovation <- innovation %>%
mutate(
MONTH = month(ymd(DATE)), # Extract month using lubridate's ymd function
MONTH = as.factor(MONTH) # Convert the extracted month into a factor
)
str(innovation)
## 'data.frame': 5157 obs. of 13 variables:
## $ MARKET_KEY : chr "806" "915" "331" "953" ...
## $ DATE : chr "2022-07-23" "2021-07-10" "2023-10-28" "2023-06-10" ...
## $ CALORIC_SEGMENT: num 1 1 1 1 1 1 1 1 1 1 ...
## $ CATEGORY : chr "SSD" "SSD" "SSD" "SSD" ...
## $ UNIT_SALES : num 7 3 19 4 77 78 69 2 12 4 ...
## $ DOLLAR_SALES : num 38.43 9.87 165.81 43.92 469.72 ...
## $ MANUFACTURER : chr "JOLLYS" "BEARS" "JOLLYS" "JOLLYS" ...
## $ BRAND : chr "HILL MOISTURE THRASHED APPLE" "SINGLE GROUP" "BEAUTIFUL GREENER" "BEAUTIFUL GREENER" ...
## $ PACKAGE : chr "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 18ONE CUP" "12SMALL 24ONE CUP" ...
## $ ITEM : chr "RAINING GENTLE DRINK THRASHED PLUM CUP 12 LIQUID SMALL X12" "ZIZZLES GENTLE DRINK PLUM CUP 12 LIQUID SMALL X12" "BEAUTIFUL GREENER GENTLE DRINK PLUM CUP 12 LIQUID SMALL X18" "BEAUTIFUL GREENER GENTLE DRINK PLUM CUP 12 LIQUID SMALL X24" ...
## $ REGION : chr "SOCAL" "DESERT_SW" "SOCAL" "ARIZONA" ...
## $ MONTH : Factor w/ 12 levels "1","2","3","4",..: 7 7 10 6 5 3 9 7 5 2 ...
## $ SEASON : chr "SUMMER" "SUMMER" "FALL" "SUMMER" ...
print(unique(innovation$ITEM))
## [1] "RAINING GENTLE DRINK THRASHED PLUM CUP 12 LIQUID SMALL X12"
## [2] "ZIZZLES GENTLE DRINK PLUM CUP 12 LIQUID SMALL X12"
## [3] "BEAUTIFUL GREENER GENTLE DRINK PLUM CUP 12 LIQUID SMALL X18"
## [4] "BEAUTIFUL GREENER GENTLE DRINK PLUM CUP 12 LIQUID SMALL X24"
## [5] "BEAUTIFUL GREENER GENTLE DRINK PLUM CUP 12 LIQUID SMALL X12"
## [6] "RAINING GENTLE DRINK THRASHED PLUM CUP 12 LIQUID SMALL"
## [7] "BEAUTIFUL GREENER GENTLE DRINK PLUM CUP 12 LIQUID SMALL X20"
## [8] "BEAUTIFUL GREENER GENTLE DRINK PLUM CUP 12 LIQUID SMALL"
## [9] "BEAUTIFUL GREENER GENTLE DRINK PLUM JUG 12 LIQUID SMALL X24"
# Count the number of unique PACKAGE column of our sample
table(innovation$PACKAGE)
##
## 12SMALL 12ONE CUP 12SMALL 18ONE CUP
## 4470 158
## 12SMALL 20ONE CUP 12SMALL 24ONE CUP
## 47 419
## 12SMALL 24ONE PLASTICS JUG 12SMALL 6ONE CUP
## 30 33
# Creating an 'innovation' data frame
model <- lm(DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + PACKAGE + SEASON + REGION, data = innovation)
summary(model)
##
## Call:
## lm(formula = DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + PACKAGE +
## SEASON + REGION, data = innovation)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1713.7 -20.3 7.2 36.1 4054.2
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.94160 6.80689 -6.455 1.18e-10 ***
## UNIT_SALES 5.21325 0.02878 181.161 < 2e-16 ***
## CALORIC_SEGMENT NA NA NA NA
## PACKAGE12SMALL 18ONE CUP 82.68936 14.85976 5.565 2.76e-08 ***
## PACKAGE12SMALL 20ONE CUP 45.36041 25.20527 1.800 0.071976 .
## PACKAGE12SMALL 24ONE CUP 278.66657 9.04304 30.816 < 2e-16 ***
## PACKAGE12SMALL 24ONE PLASTICS JUG 78.90410 31.40814 2.512 0.012028 *
## PACKAGE12SMALL 6ONE CUP -27.55356 29.95315 -0.920 0.357674
## SEASONSPRING 17.21712 6.77551 2.541 0.011080 *
## SEASONSUMMER 23.75171 6.73407 3.527 0.000424 ***
## SEASONWINTER 9.11365 6.95267 1.311 0.189979
## REGIONCALI_NEVADA 24.74310 14.96553 1.653 0.098323 .
## REGIONCOLORADO 40.01146 8.34877 4.793 1.69e-06 ***
## REGIONDESERT_SW 33.93184 10.13193 3.349 0.000817 ***
## REGIONKANSAS 182.94177 17.53758 10.431 < 2e-16 ***
## REGIONMOUNTAIN 20.75593 9.47696 2.190 0.028558 *
## REGIONNEWMEXICO 27.12353 13.03611 2.081 0.037516 *
## REGIONNOCAL -10.41959 10.94085 -0.952 0.340961
## REGIONNORTHERN 39.43195 7.81663 5.045 4.70e-07 ***
## REGIONPRAIRIE 33.61715 16.23417 2.071 0.038431 *
## REGIONSOCAL -25.08608 8.17700 -3.068 0.002167 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 170.3 on 5137 degrees of freedom
## Multiple R-squared: 0.8777, Adjusted R-squared: 0.8773
## F-statistic: 1941 on 19 and 5137 DF, p-value: < 2.2e-16
Good gravy 0.87 R-squared. This model is a beast. PACKAGE12SMALL 24ONE CUP and KANSAS are the most significant variables. Seasonally Spring and Summer are the most signficant and best times to sell.
#More exploration
library(dplyr)
small_group <- df %>%
filter(UNIT_SALES < 600, DOLLAR_SALES < 3600)
skim(small_group)
| Name | small_group |
| Number of rows | 2303607 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| MARKET_KEY | 0 | 1 | 1 | 4 | 0 | 200 | 0 |
| DATE | 0 | 1 | 10 | 10 | 0 | 152 | 0 |
| CATEGORY | 0 | 1 | 3 | 18 | 0 | 5 | 0 |
| MANUFACTURER | 0 | 1 | 5 | 8 | 0 | 8 | 0 |
| BRAND | 0 | 1 | 4 | 56 | 0 | 288 | 0 |
| PACKAGE | 0 | 1 | 11 | 26 | 0 | 95 | 0 |
| ITEM | 0 | 1 | 26 | 142 | 0 | 2999 | 0 |
| REGION | 0 | 1 | 5 | 11 | 0 | 11 | 0 |
| SEASON | 0 | 1 | 4 | 6 | 0 | 4 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| CALORIC_SEGMENT | 0 | 1 | 0.49 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▇ |
| UNIT_SALES | 0 | 1 | 80.31 | 110.56 | 0.04 | 10.00 | 35.00 | 102.00 | 599.00 | ▇▁▁▁▁ |
| DOLLAR_SALES | 0 | 1 | 284.84 | 432.53 | 0.01 | 33.12 | 118.54 | 342.48 | 3599.94 | ▇▁▁▁▁ |
| MONTH | 0 | 1 | 6.28 | 3.44 | 1.00 | 3.00 | 6.00 | 9.00 | 12.00 | ▇▆▆▅▇ |
Our sample looks farily representative as the mean and sd are quite close to the full df for DIET SMASH.
# Create a scatter plot with the regression line, colored by MANUFACTURER
ggplot(small_group, aes(x = UNIT_SALES, y = DOLLAR_SALES, color = MANUFACTURER)) +
geom_point(alpha = 0.5) + # Adjust alpha to avoid overplotting, if necessary
geom_smooth(method = "lm", color = "black", se = FALSE) + # Add linear regression line without confidence band for clarity
labs(title = "Linear Model of UNIT_SALES vs. DOLLAR_SALES by MANUFACTURER",
x = "UNTI SALES",
y = "DOLLAR SALES") +
theme_minimal() +
theme(legend.position = "bottom") # Adjust legend position if needed
## `geom_smooth()` using formula = 'y ~ x'
Behold the realm of DIET SMASH. Certain items sell much better, or wosrse with consideration of slop of dollars to units sold. The overall trend line in this realm is below that of DIET SMASH, as DIET SMASH is almost $1000 at 200 units sold while the realm is 375 units sold to get to $1000.
#Make the small plum df > Investigating drinks with Plum as a flavor in the Item description.
# Create a new data frame with only the rows where the ITEM column contains the word 'plum'
plum_small <- df[grep("plum", df$ITEM, ignore.case = TRUE), ]
skim(plum_small)
| Name | plum_small |
| Number of rows | 28981 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| MARKET_KEY | 0 | 1 | 1 | 4 | 0 | 200 | 0 |
| DATE | 0 | 1 | 10 | 10 | 0 | 152 | 0 |
| CATEGORY | 0 | 1 | 3 | 18 | 0 | 4 | 0 |
| MANUFACTURER | 0 | 1 | 5 | 8 | 0 | 6 | 0 |
| BRAND | 0 | 1 | 5 | 30 | 0 | 21 | 0 |
| PACKAGE | 0 | 1 | 12 | 26 | 0 | 20 | 0 |
| ITEM | 0 | 1 | 48 | 126 | 0 | 52 | 0 |
| REGION | 0 | 1 | 5 | 11 | 0 | 11 | 0 |
| SEASON | 0 | 1 | 4 | 6 | 0 | 4 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| CALORIC_SEGMENT | 0 | 1 | 0.60 | 0.49 | 0.0 | 0.00 | 1.00 | 1.00 | 1.00 | ▆▁▁▁▇ |
| UNIT_SALES | 0 | 1 | 57.73 | 182.93 | 1.0 | 7.00 | 23.00 | 62.00 | 19588.00 | ▇▁▁▁▁ |
| DOLLAR_SALES | 0 | 1 | 168.67 | 545.14 | 0.5 | 17.64 | 61.28 | 173.03 | 56603.31 | ▇▁▁▁▁ |
| MONTH | 0 | 1 | 6.34 | 3.51 | 1.0 | 3.00 | 6.00 | 10.00 | 12.00 | ▇▆▅▃▇ |
Plum as a flavor has lower unit sales mean of 58 and dollar sales mean of $169 as compared to overall Diet Smash at 80 mean unit sales and mean of $235 dolalr sales.
# plum small is dataframe
model <- lm(DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + PACKAGE + CATEGORY + SEASON + REGION, data = plum_small)
summary(model)
##
## Call:
## lm(formula = DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + PACKAGE +
## CATEGORY + SEASON + REGION, data = plum_small)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3296.9 -35.7 5.2 42.1 6282.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.662e+02 1.101e+01 -15.092 < 2e-16 ***
## UNIT_SALES 2.855e+00 4.672e-03 611.008 < 2e-16 ***
## CALORIC_SEGMENT 2.254e+01 3.289e+00 6.854 7.35e-12 ***
## PACKAGE12SMALL 12ONE CUP 1.104e+02 8.626e+00 12.799 < 2e-16 ***
## PACKAGE12SMALL 18ONE CUP 5.029e+01 1.389e+01 3.621 0.000294 ***
## PACKAGE12SMALL 20ONE CUP 2.126e+01 2.202e+01 0.965 0.334501
## PACKAGE12SMALL 24ONE CUP 3.102e+02 1.031e+01 30.093 < 2e-16 ***
## PACKAGE12SMALL 24ONE PLASTICS JUG 3.357e+01 2.683e+01 1.251 0.210785
## PACKAGE12SMALL 6ONE CUP -2.337e+01 2.577e+01 -0.907 0.364466
## PACKAGE12SMALL 8ONE CUP 1.411e+02 1.017e+01 13.881 < 2e-16 ***
## PACKAGE12SMALL MLT PLASTICS JUG -6.406e+01 9.688e+00 -6.612 3.86e-11 ***
## PACKAGE12SMALL MULTI CUP 1.308e+02 1.091e+01 11.985 < 2e-16 ***
## PACKAGE16SMALL 12ONE CUP 1.705e+02 2.270e+01 7.509 6.14e-14 ***
## PACKAGE16SMALL 4ONE CUP 1.716e+02 1.564e+01 10.973 < 2e-16 ***
## PACKAGE16SMALL MULTI CUP 8.494e+01 1.044e+01 8.133 4.35e-16 ***
## PACKAGE20SMALL MULTI JUG -6.308e+01 8.786e+00 -7.179 7.17e-13 ***
## PACKAGE24SMALL MLT SHADYES JUG -3.669e+01 2.074e+01 -1.769 0.076839 .
## PACKAGE24SMALL MULTI CUP 1.658e+02 1.961e+01 8.459 < 2e-16 ***
## PACKAGE2L MULTI JUG -9.688e+01 8.963e+00 -10.809 < 2e-16 ***
## PACKAGE8SMALL 4ONE CUP 2.530e+02 1.215e+01 20.825 < 2e-16 ***
## PACKAGE8SMALL MULTI CUP 9.426e+01 1.165e+01 8.092 6.09e-16 ***
## PACKAGEALL OTHER ONES 9.939e+01 1.070e+01 9.290 < 2e-16 ***
## CATEGORYING ENHANCED WATER 2.111e+02 1.022e+01 20.652 < 2e-16 ***
## CATEGORYSPARKLING WATER 6.433e+01 4.377e+00 14.698 < 2e-16 ***
## CATEGORYSSD 1.509e+02 6.341e+00 23.803 < 2e-16 ***
## SEASONSPRING 1.835e+00 2.363e+00 0.777 0.437353
## SEASONSUMMER 4.347e+00 2.447e+00 1.776 0.075661 .
## SEASONWINTER -4.054e+00 2.367e+00 -1.713 0.086711 .
## REGIONCALI_NEVADA 6.767e+00 4.883e+00 1.386 0.165813
## REGIONCOLORADO 2.440e+01 2.874e+00 8.492 < 2e-16 ***
## REGIONDESERT_SW 2.447e+00 3.705e+00 0.661 0.508907
## REGIONKANSAS 3.333e+01 5.680e+00 5.868 4.47e-09 ***
## REGIONMOUNTAIN 7.940e+00 3.202e+00 2.479 0.013171 *
## REGIONNEWMEXICO 1.678e+01 4.336e+00 3.870 0.000109 ***
## REGIONNOCAL 2.055e+00 4.092e+00 0.502 0.615463
## REGIONNORTHERN -6.459e+00 2.562e+00 -2.522 0.011688 *
## REGIONPRAIRIE 8.138e+00 5.232e+00 1.555 0.119841
## REGIONSOCAL 1.362e+00 3.126e+00 0.436 0.662929
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 139.8 on 28943 degrees of freedom
## Multiple R-squared: 0.9343, Adjusted R-squared: 0.9343
## F-statistic: 1.113e+04 on 37 and 28943 DF, p-value: < 2.2e-16
Adjusted R squared of 0.934, just amazing. Plum does well in Colorado, Kansas, and New Mexico (the last two being similar to Diet Smash). The two existing packages that Diet Smash comes in also are significant, for Plum (2L Multi JUG fairly strong negative like Diet Smash and 12small 12one CUP just slightly positive). The 4 week package run, PACKAGE12SMALL 6ONE CUP, from Diet Smash is not statisically signficant for Plum either, which could be cause of concern for what sounds to be a similar package of 11small 4one “JUG|CUP”. Although “All Other Ones”, could be a good proxy for innovation packaging and is strongly signficant. The only thing that is not significant across the board is the season for plum, although Summer and Winter could be edge cases thrown off by some noise.
Reworking the subset plum for more feature engineering.
plum_small <- plum_small %>%
mutate(
PACKAGE2 = str_extract(ITEM, "(CUP|JUG).*"), # Extracts the part from CUP or JUG to the end.
ITEM = str_replace(ITEM, "(CUP|JUG).*", "") # Replaces the CUP/JUG and everything after it with empty string in ITEM.
)
#plum_small
plum_small <- plum_small %>%
mutate(
TEMP = str_extract(ITEM, "\\d+\\.?\\d*.*"), # Extracts the part from the first number to the end.
PACKAGE2 = if_else(is.na(PACKAGE2), TEMP, paste(PACKAGE2, TEMP)), # Combines existing PACKAGE2 with new extraction if needed.
ITEM = str_replace(ITEM, "\\d+\\.?\\d*.*", ""), # Removes the numeric part and everything after it from ITEM.
TEMP = NULL # Removes the temporary column.
)
#plum_small
na_rows <- plum_small %>%
filter(is.na(PACKAGE2))
#na_rows
#the above steps excised all packaging out of ITEM column
plum_small <- plum_small %>%
mutate(
GENTLE_DRINK = if_else(str_detect(ITEM, "GENTLE DRINK"), 1, 0), # Assigns 1 if "GENTLE DRINK" exists, otherwise 0.
ITEM = str_replace(ITEM, "GENTLE DRINK", "") # Removes "GENTLE DRINK" from ITEM.
)
#plum_small
plum_small <- plum_small %>%
mutate(
ENERGY_DRINK = if_else(str_detect(ITEM, "ENERGY DRINK"), 1, 0), # Assigns 1 if "ENERGY DRINK" exists, otherwise 0.
ITEM = str_replace(ITEM, "ENERGY DRINK", "") # Removes "ENERGY DRINK" from ITEM.
)
#plum_small
library(dplyr)
library(stringr)
# Define the pattern as a regular expression
pattern <- "ZERO CALORIES|ZERO CALORIE|ZERO SUGAR|SUGAR FREE|NO CALORIES"
plum_small <- plum_small %>%
mutate(
CALORIC_SEGMENT_TEXT = str_extract(ITEM, pattern), # Extracts matching text based on the pattern.
ITEM = str_replace_all(ITEM, pattern, "") # Removes extracted text from ITEM.
)
#plum_small
library(dplyr)
library(stringr)
plum_small <- plum_small %>%
mutate(
CALORIC_SEGMENT_TEXT = if_else(str_detect(ITEM, "\\bDIET\\b"),
if_else(is.na(CALORIC_SEGMENT_TEXT), "DIET", paste(CALORIC_SEGMENT_TEXT, "DIET", sep=", ")),
CALORIC_SEGMENT_TEXT)
)
#plum_small
# Function to remove the second instance of any repeating word
remove_second_instance <- function(item) {
words <- unlist(str_split(item, "\\s+")) # Split item into words
unique_words <- unique(words) # Get unique words to check for repeats
for (word in unique_words) {
word_indices <- which(words == word) # Find all indices of the current word
if (length(word_indices) > 1) { # If there is more than one occurrence
words[word_indices[2]] <- "" # Remove the second occurrence
}
}
return(paste(words, collapse = " ")) # Reconstruct sentence without the second instance
}
# Apply the function to the 'ITEM' column
plum_small <- plum_small %>%
mutate(ITEM = sapply(ITEM, remove_second_instance))
# Remove specific columns
plum_small <- select(plum_small, -PACKAGE2, -GENTLE_DRINK, -ENERGY_DRINK, -CALORIC_SEGMENT_TEXT)
head(plum_small)
## MARKET_KEY DATE CALORIC_SEGMENT CATEGORY UNIT_SALES
## 9204434 424 2021-06-12 1 ENERGY 115
## 17627306 806 2022-02-26 1 ENERGY 36
## 18144465 831 2023-03-25 1 SSD 93
## 647833 32 2022-01-15 0 SPARKLING WATER 12
## 1090185 56 2021-02-06 0 ING ENHANCED WATER 45
## 17667230 806 2022-07-23 1 SSD 7
## DOLLAR_SALES MANUFACTURER BRAND
## 9204434 186.10 JOLLYS SUPER-DUPER PUNCHED
## 17627306 74.77 JOLLYS SUPER-DUPER PUNCHED
## 18144465 204.19 JOLLYS BEAUTIFUL GREENER
## 647833 43.29 BEARS CROWN
## 1090185 53.53 JOLLYS SOOOO-COOOOL FUTURE WATER ZERO
## 17667230 38.43 JOLLYS HILL MOISTURE THRASHED APPLE
## PACKAGE
## 9204434 16SMALL MULTI CUP
## 17627306 16SMALL MULTI CUP
## 18144465 20SMALL MULTI JUG
## 647833 12SMALL 8ONE CUP
## 1090185 20SMALL MULTI JUG
## 17667230 12SMALL 12ONE CUP
## ITEM
## 9204434 SUPER-DUPER PITAYA ED HARDONLY PLUM
## 17627306 SUPER-DUPER PITAYA ED HARDONLY PLUM
## 18144465 BEAUTIFUL GREENER PLUM
## 647833 CROWN SPARKLING WATER BEACH PLUM
## 1090185 SOOOO-COOOOL FUTURE WATER BEVERAGE FUJI PLUM KEEN ZERO CAL PER
## 17667230 RAINING THRASHED PLUM
## REGION MONTH SEASON
## 9204434 NORTHERN 6 SUMMER
## 17627306 SOCAL 2 WINTER
## 18144465 ARIZONA 3 SPRING
## 647833 NORTHERN 1 WINTER
## 1090185 NORTHERN 2 WINTER
## 17667230 SOCAL 7 SUMMER
We now know that there are 2 innovation aspects at play here, a new package for Diet Smash in the complete sense of 11small and 4one and an existing flavor,net new added to Diet Smash. Both our innovation plum form and the small Plum Multiple Linear regressions are incredibly high indicating that there is high potential some reasonable forecasting. Diet Smash has some significance in terms of season in Summer and Winter, but Plum by itself is right on the edge of not being significant for Summer and Winter. Our innovation data frame showed promised spring and summer. Packaging is not as strong by itself as Plum and Diet Smash ony comes in 2 regular types with 1 size that ran for 4 weeks. 12small 6one is likely pretty close for 11small 4one, but regressions in the innovation data frame showed PACKAGE12SMALL 24ONE CUP as the winner . It’s possible that together, based on Diet Smash, that the best 13 weeks are Spring and Summer for Plum, with specific week details to be determined.
df <- read_csv("swire_no_nas_w_pop.csv") #inject the data and we will sub-sample
## Rows: 24461424 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): CALORIC_SEGMENT, CATEGORY, MANUFACTURER, BRAND, PACKAGE, ITEM
## dbl (4): MARKET_KEY, UNIT_SALES, DOLLAR_SALES, POP_SQMI
## date (1): DATE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Remove things at the very beginning and drop the weird missing period near the end
#Group by ITEM with DATE Before 2021-01-01, drop those ITEM rows
# df_long_running <- df %>%
# group_by(ITEM) %>%
# filter(DATE <= "2021-01-01")
#
# #remove all rows in plum_long_running from plum
# df <- df %>%
# anti_join(df_long_running)
#Group by ITEM rows with less than 12 weeks of data
df_small <- df %>%
group_by(ITEM) %>%
filter(n() <= 12)
#remove all rows in df small that run less than 13 weeks
df <- df %>%
anti_join(df_small)
## Joining with `by = join_by(MARKET_KEY, DATE, CALORIC_SEGMENT, CATEGORY,
## UNIT_SALES, DOLLAR_SALES, MANUFACTURER, BRAND, PACKAGE, ITEM, POP_SQMI)`
#Drop rows after May 21st 2023 as there are several gaps for most brands in innovation plum
df <- df %>%
filter(DATE <= "2023-05-21")
#cleanup everything but df
rm(df_small)
#skim(df)
#start with PLUM, SSD, and Package features
plum_package <- df %>%
filter(CATEGORY == "SSD",
str_detect(ITEM, "PLUM"))
#try plum ssd diet
plum_diet <- df %>%
filter (
str_detect(ITEM, "PLUM"),
CALORIC_SEGMENT == "DIET/LIGHT")
#toss in some diet smash for good measure ans some package features
diet_smash <- df %>%
filter (
BRAND == "DIET SMASH",
CATEGORY == "SSD")
#combine the three
merged_plum_innovation <- bind_rows(plum_package, plum_diet, diet_smash)
#remove duplicate rows
df <- merged_plum_innovation %>% distinct()
skim(df)
| Name | df |
| Number of rows | 215151 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| character | 6 |
| Date | 1 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| CALORIC_SEGMENT | 0 | 1 | 7 | 10 | 0 | 2 | 0 |
| CATEGORY | 0 | 1 | 3 | 18 | 0 | 4 | 0 |
| MANUFACTURER | 0 | 1 | 5 | 8 | 0 | 5 | 0 |
| BRAND | 0 | 1 | 5 | 30 | 0 | 15 | 0 |
| PACKAGE | 0 | 1 | 12 | 26 | 0 | 18 | 0 |
| ITEM | 0 | 1 | 48 | 126 | 0 | 39 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| DATE | 0 | 1 | 2020-12-05 | 2023-05-20 | 2022-04-09 | 129 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| MARKET_KEY | 0 | 1 | 630.96 | 613.44 | 1.00 | 303.00 | 613.00 | 882.00 | 6802.00 | ▇▁▁▁▁ |
| UNIT_SALES | 0 | 1 | 46.29 | 76.20 | 1.00 | 7.00 | 22.00 | 55.00 | 3157.00 | ▇▁▁▁▁ |
| DOLLAR_SALES | 0 | 1 | 138.90 | 260.51 | 0.03 | 18.00 | 60.39 | 156.86 | 12763.23 | ▇▁▁▁▁ |
| POP_SQMI | 0 | 1 | 1548.17 | 1855.52 | 0.18 | 47.98 | 709.34 | 2774.91 | 6769.35 | ▇▂▂▁▁ |
#cleanup all objects other than df
rm(plum_package, plum_diet, diet_smash, merged_plum_innovation)
regions_joinme <- read.csv("states_summary.csv")
unique(regions_joinme$REGION)
## [1] "NORTHERN" "DESERT_SW" "PRAIRIE" "CALI_NEVADA" "MOUNTAIN"
## [6] "SOCAL" "ARIZONA" "NEWMEXICO" "NOCAL" "COLORADO"
## [11] "KANSAS"
# "NORTHERN" "DESERT_SW" "PRAIRIE" "CALI_NEVADA" "MOUNTAIN" "SOCAL" "ARIZONA" "NEWMEXICO" "NOCAL" "COLORADO" "KANSAS"
str(regions_joinme)
## 'data.frame': 200 obs. of 2 variables:
## $ MARKET_KEY: int 13 70 179 197 272 352 32 33 44 50 ...
## $ REGION : chr "NORTHERN" "NORTHERN" "DESERT_SW" "DESERT_SW" ...
# Perform a left join using the merge() function
df <- merge(df, regions_joinme[, c("MARKET_KEY", "REGION")], by = "MARKET_KEY", all.x = TRUE)
rm(regions_joinme)
# Update CALORIC_SEGMENT values: 0 if 'DIET/LIGHT', otherwise 1
df$CALORIC_SEGMENT <- ifelse(df$CALORIC_SEGMENT == "DIET/LIGHT", 0, 1)
df$MARKET_KEY <- as.character(df$MARKET_KEY)
df <- df %>%
mutate(
MONTH = as.numeric(substr(DATE, 6, 7)), # Extract the month from YYYY-MM-DD format
SEASON = case_when(
MONTH %in% c(12, 01, 02) ~ "WINTER",
MONTH %in% c(03, 04, 05) ~ "SPRING",
MONTH %in% c(06, 07, 08) ~ "SUMMER",
MONTH %in% c(09, 10, 11) ~ "FALL",
TRUE ~ NA_character_ # This is just in case there are any undefined values
)
)
#save merged_innovation_df back to plum
plum <- df
skim(plum)
| Name | plum |
| Number of rows | 215151 |
| Number of columns | 14 |
| _______________________ | |
| Column type frequency: | |
| character | 8 |
| Date | 1 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| MARKET_KEY | 0 | 1 | 1 | 4 | 0 | 200 | 0 |
| CATEGORY | 0 | 1 | 3 | 18 | 0 | 4 | 0 |
| MANUFACTURER | 0 | 1 | 5 | 8 | 0 | 5 | 0 |
| BRAND | 0 | 1 | 5 | 30 | 0 | 15 | 0 |
| PACKAGE | 0 | 1 | 12 | 26 | 0 | 18 | 0 |
| ITEM | 0 | 1 | 48 | 126 | 0 | 39 | 0 |
| REGION | 0 | 1 | 5 | 11 | 0 | 11 | 0 |
| SEASON | 0 | 1 | 4 | 6 | 0 | 4 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| DATE | 0 | 1 | 2020-12-05 | 2023-05-20 | 2022-04-09 | 129 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| CALORIC_SEGMENT | 0 | 1 | 0.44 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▆ |
| UNIT_SALES | 0 | 1 | 46.29 | 76.20 | 1.00 | 7.00 | 22.00 | 55.00 | 3157.00 | ▇▁▁▁▁ |
| DOLLAR_SALES | 0 | 1 | 138.90 | 260.51 | 0.03 | 18.00 | 60.39 | 156.86 | 12763.23 | ▇▁▁▁▁ |
| POP_SQMI | 0 | 1 | 1548.17 | 1855.52 | 0.18 | 47.98 | 709.34 | 2774.91 | 6769.35 | ▇▂▂▁▁ |
| MONTH | 0 | 1 | 6.20 | 3.57 | 1.00 | 3.00 | 6.00 | 9.00 | 12.00 | ▇▆▅▃▇ |
Reworking the subset plum for more feature engineering.
plum <- plum %>%
mutate(
PACKAGE2 = str_extract(ITEM, "(CUP|JUG).*"), # Extracts the part from CUP or JUG to the end.
ITEM = str_replace(ITEM, "(CUP|JUG).*", "") # Replaces the CUP/JUG and everything after it with empty string in ITEM.
)
plum <- plum %>%
mutate(
TEMP = str_extract(ITEM, "\\d+\\.?\\d*.*"), # Extracts the part from the first number to the end.
PACKAGE2 = if_else(is.na(PACKAGE2), TEMP, paste(PACKAGE2, TEMP)), # Combines existing PACKAGE2 with new extraction if needed.
ITEM = str_replace(ITEM, "\\d+\\.?\\d*.*", ""), # Removes the numeric part and everything after it from ITEM.
TEMP = NULL # Removes the temporary column.
)
na_rows <- plum %>%
filter(is.na(PACKAGE2))
na_rows
## [1] MARKET_KEY DATE CALORIC_SEGMENT CATEGORY
## [5] UNIT_SALES DOLLAR_SALES MANUFACTURER BRAND
## [9] PACKAGE ITEM POP_SQMI REGION
## [13] MONTH SEASON PACKAGE2
## <0 rows> (or 0-length row.names)
#the above steps excised all packaging out of ITEM column
# Function to remove the second instance of any repeating word
remove_second_instance <- function(item) {
words <- unlist(str_split(item, "\\s+")) # Split item into words
unique_words <- unique(words) # Get unique words to check for repeats
for (word in unique_words) {
word_indices <- which(words == word) # Find all indices of the current word
if (length(word_indices) > 1) { # If there is more than one occurrence
words[word_indices[2]] <- "" # Remove the second occurrence
}
}
return(paste(words, collapse = " ")) # Reconstruct sentence without the second instance
}
# Apply the function to the 'ITEM' column
plum <- plum %>%
mutate(ITEM = sapply(ITEM, remove_second_instance))
# #One hot encode either "ENERGY" or "ED" in ITEM as an ENERGY_DRINK
plum$ENERGY_DRINK <- ifelse(str_detect(plum$ITEM, "ENERGY|' ED'"), 1, 0)
plum$ITEM <- str_replace(plum$ITEM, "ENERGY DRINK", "")
plum$ITEM <- str_replace(plum$ITEM, "ENERGY", "")
plum$ITEM <- str_replace(plum$ITEM, " ED", "")
table(plum$ENERGY_DRINK)
##
## 0 1
## 176398 38753
table(plum$CATEGORY)
##
## ENERGY ING ENHANCED WATER SPARKLING WATER SSD
## 38753 3760 63752 108886
plum %>%
filter(ENERGY_DRINK == 1,
CATEGORY=='SSD') %>%
select(ITEM) %>%
head(10)
## [1] ITEM
## <0 rows> (or 0-length row.names)
# Remove specific columns
#plum <- select(plum, -PACKAGE2, -CATEGORY)
head(plum)
## MARKET_KEY DATE CALORIC_SEGMENT CATEGORY UNIT_SALES DOLLAR_SALES
## 1 1 2022-06-18 1 SSD 1 4.62
## 2 1 2022-04-30 1 SSD 14 86.86
## 3 1 2021-12-11 1 SSD 18 89.73
## 4 1 2022-07-30 1 SSD 13 65.60
## 5 1 2021-11-27 1 SSD 19 72.93
## 6 1 2022-06-11 1 SSD 4 25.60
## MANUFACTURER BRAND PACKAGE
## 1 JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 2 JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 3 JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 4 JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 5 JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 6 JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## ITEM POP_SQMI REGION MONTH SEASON
## 1 RAINING GENTLE DRINK THRASHED PLUM 1.201114 NORTHERN 6 SUMMER
## 2 RAINING GENTLE DRINK THRASHED PLUM 1.201114 NORTHERN 4 SPRING
## 3 RAINING GENTLE DRINK THRASHED PLUM 1.201114 NORTHERN 12 WINTER
## 4 RAINING GENTLE DRINK THRASHED PLUM 1.201114 NORTHERN 7 SUMMER
## 5 RAINING GENTLE DRINK THRASHED PLUM 1.201114 NORTHERN 11 FALL
## 6 RAINING GENTLE DRINK THRASHED PLUM 1.201114 NORTHERN 6 SUMMER
## PACKAGE2 ENERGY_DRINK
## 1 CUP 12 LIQUID SMALL X12 NA 0
## 2 CUP 12 LIQUID SMALL X12 NA 0
## 3 CUP 12 LIQUID SMALL X12 NA 0
## 4 CUP 12 LIQUID SMALL X12 NA 0
## 5 CUP 12 LIQUID SMALL X12 NA 0
## 6 CUP 12 LIQUID SMALL X12 NA 0
table(plum$ITEM)
##
## BEAUTIFUL GREENER GENTLE DRINK PLUM
## 63882
## BUBBLE JOY SPARKLING WATER PLUM NO CALORIES
## 2354
## CROWN CURATE SPARKLING WATER PLUM BERRY CALORIE FREE
## 31
## CROWN CURATE SPARKLING WATER PLUM MIXED-TROPPY ZERO CALORIE
## 3134
## CROWN SPARKLING WATER BEACH PLUM ZERO CALORIE
## 25354
## DIET SMASH GENTLE DRINK SUNSET
## 14704
## DIGRESS NOURISH ENHANCE WATER BEVERAGE PLUM KEEN
## 2324
## EXCLAMATION REFRESHER DRINK FLAVORED SPARKLING WATER FUJI PLUM PLUS WHITE NO SWEETENERS
## 21216
## EXCLAMATION REFRESHER SPARKLING WATER PLUM PLUS GINGER NO CALORIES
## 959
## FANTASMIC GENTLE DRINK CUSTARD APPLE PLUM
## 15
## FANTASMIC GENTLE DRINK PLUM
## 52
## FIZZY SPARKLING JUICE BEVERAGE PLUM NO ADDED SUGAR
## 10704
## GO-DAY GENTLE DRINK
## 499
## POW-POW
## 424
## RAINING GENTLE DRINK THRASHED PLUM
## 26462
## SOOOO-COOOOL FUTURE WATER BEVERAGE FUJI PLUM KEEN ZERO CAL PER
## 1436
## SUPER-DUPER RUN-QUICK SUPER SOUR CUSTARD APPLE PLUM SUGAR FREE
## 22970
## SUPER-DUPER RUN-QUICK SUPER SOURS CUSTARD APPLE PLUM
## 851
## SUPER-DUPER RUN-QUICK SUPER SOURS CUSTARD APPLE PLUM SUGAR FREE
## 1488
## SUPER-DUPER RUN-QUICK SUPER SOURS CUSTARD APPLE PLUM ZERO SUGAR
## 9591
## VENOMOUS BLAST PINK PLUM
## 3429
## ZIZZLES GENTLE DRINK PLUM
## 3272
#Trim trailing white space at end of ITEM
plum$ITEM <- str_trim(plum$ITEM, side = "right")
# #replace "ENERGY DRINK" with "" in ITEM
plum$ITEM <- str_replace(plum$ITEM, "GENTLE DRINK", "")
pattern <- "ZERO CALORIES|ZERO CALORIE|ZERO SUGAR|SUGAR FREE|NO CALORIES|ZERO CARB|PURE ZERO|DIET|NO SWEETENERS|ZERO CAL PER|CALORIE FREE"
plum <- plum %>%
mutate(
CALORIC_SEGMENT_TEXT = str_extract(ITEM, pattern), # Extracts matching text based on the pattern.
ITEM = str_replace_all(ITEM, pattern, "") # Removes extracted text from ITEM.
)
# Find the minimum launch date for each product
min_launch_dates <- plum %>%
group_by(ITEM) %>%
summarise(min_launch_date = min(DATE))
# Join the minimum launch dates back to the original data
plum <- plum %>%
left_join(min_launch_dates, by = "ITEM")
# Calculate the number of weeks since the product launch
plum <- plum %>%
mutate(WEEKS_SINCE_LAUNCH = as.numeric(difftime(DATE, min_launch_date, units = "weeks")))
# Selecting required columns and printing the first 10 rows
plum %>%
filter(UNIT_SALES > 0) %>%
select(DATE, ITEM, WEEKS_SINCE_LAUNCH) %>%
head(10)
## DATE ITEM WEEKS_SINCE_LAUNCH
## 1 2022-06-18 RAINING THRASHED PLUM 41
## 2 2022-04-30 RAINING THRASHED PLUM 34
## 3 2021-12-11 RAINING THRASHED PLUM 14
## 4 2022-07-30 RAINING THRASHED PLUM 47
## 5 2021-11-27 RAINING THRASHED PLUM 12
## 6 2022-06-11 RAINING THRASHED PLUM 40
## 7 2022-01-22 RAINING THRASHED PLUM 20
## 8 2021-11-06 RAINING THRASHED PLUM 9
## 9 2021-10-16 RAINING THRASHED PLUM 6
## 10 2021-12-04 RAINING THRASHED PLUM 13
#Set any negative WEEKS_SINCE_LAUNCH to 0
# plum <- plum %>%
# mutate(WEEKS_SINCE_LAUNCH = ifelse(WEEKS_SINCE_LAUNCH < 0, 0, WEEKS_SINCE_LAUNCH))
#too many brands and packages, lets try removing ITEM or CATEGORIES of WATER, ENERGY, and JUG packages
plum <- plum %>%
filter(!grepl("WATER", ITEM),
!CATEGORY=='SPARKLING WATER',
!CATEGORY=='ING ENHANCED WATER',
!CATEGORY=='ENERGY')
#Let's remove Brand Diet Smash
# plum <- plum %>%
# filter(!BRAND=='DIET SMASH')
print(unique(plum$ITEM))
## [1] "RAINING THRASHED PLUM" "BEAUTIFUL GREENER PLUM"
## [3] " SMASH SUNSET" "FANTASMIC PLUM"
## [5] "ZIZZLES PLUM" "GO-DAY "
## [7] "FANTASMIC CUSTARD APPLE PLUM"
print(unique(plum$BRAND))
## [1] "HILL MOISTURE THRASHED APPLE" "BEAUTIFUL GREENER"
## [3] "DIET SMASH" "FANTASMIC"
## [5] "SINGLE GROUP" "GO-DAY"
print(unique(plum$CATEGORY))
## [1] "SSD"
print(unique(plum$PACKAGE))
## [1] "12SMALL 12ONE CUP" "20SMALL MULTI JUG"
## [3] "2L MULTI JUG" "12SMALL 6ONE CUP"
## [5] "12SMALL 24ONE CUP" "12SMALL MLT PLASTICS JUG"
## [7] "12SMALL 20ONE CUP" ".5L 6ONE JUG"
## [9] "12SMALL 18ONE CUP" "12SMALL 24ONE PLASTICS JUG"
## [11] "24SMALL MLT SHADYES JUG"
write_csv(plum, "plum_tableau.csv")
str(plum)
## 'data.frame': 108886 obs. of 19 variables:
## $ MARKET_KEY : chr "1" "1" "1" "1" ...
## $ DATE : Date, format: "2022-06-18" "2022-04-30" ...
## $ CALORIC_SEGMENT : num 1 1 1 1 1 1 1 1 1 1 ...
## $ CATEGORY : chr "SSD" "SSD" "SSD" "SSD" ...
## $ UNIT_SALES : num 1 14 18 13 19 4 29 35 75 25 ...
## $ DOLLAR_SALES : num 4.62 86.86 89.73 65.6 72.93 ...
## $ MANUFACTURER : chr "JOLLYS" "JOLLYS" "JOLLYS" "JOLLYS" ...
## $ BRAND : chr "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" ...
## $ PACKAGE : chr "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" ...
## $ ITEM : chr "RAINING THRASHED PLUM" "RAINING THRASHED PLUM" "RAINING THRASHED PLUM" "RAINING THRASHED PLUM" ...
## $ POP_SQMI : num 1.2 1.2 1.2 1.2 1.2 ...
## $ REGION : chr "NORTHERN" "NORTHERN" "NORTHERN" "NORTHERN" ...
## $ MONTH : num 6 4 12 7 11 6 1 11 10 12 ...
## $ SEASON : chr "SUMMER" "SPRING" "WINTER" "SUMMER" ...
## $ PACKAGE2 : chr "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" ...
## $ ENERGY_DRINK : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CALORIC_SEGMENT_TEXT: chr NA NA NA NA ...
## $ min_launch_date : Date, format: "2021-09-04" "2021-09-04" ...
## $ WEEKS_SINCE_LAUNCH : num 41 34 14 47 12 40 20 9 6 13 ...
#remove all objects other than plum
rm(list = setdiff(ls(), "plum"))
print(unique(plum$ITEM))
## [1] "RAINING THRASHED PLUM" "BEAUTIFUL GREENER PLUM"
## [3] " SMASH SUNSET" "FANTASMIC PLUM"
## [5] "ZIZZLES PLUM" "GO-DAY "
## [7] "FANTASMIC CUSTARD APPLE PLUM"
print(unique(plum$BRAND))
## [1] "HILL MOISTURE THRASHED APPLE" "BEAUTIFUL GREENER"
## [3] "DIET SMASH" "FANTASMIC"
## [5] "SINGLE GROUP" "GO-DAY"
print(unique(plum$CATEGORY))
## [1] "SSD"
print(unique(plum$PACKAGE))
## [1] "12SMALL 12ONE CUP" "20SMALL MULTI JUG"
## [3] "2L MULTI JUG" "12SMALL 6ONE CUP"
## [5] "12SMALL 24ONE CUP" "12SMALL MLT PLASTICS JUG"
## [7] "12SMALL 20ONE CUP" ".5L 6ONE JUG"
## [9] "12SMALL 18ONE CUP" "12SMALL 24ONE PLASTICS JUG"
## [11] "24SMALL MLT SHADYES JUG"
skim(plum)
| Name | plum |
| Number of rows | 108886 |
| Number of columns | 19 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| Date | 2 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| MARKET_KEY | 0 | 1.00 | 1 | 4 | 0 | 197 | 0 |
| CATEGORY | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
| MANUFACTURER | 0 | 1.00 | 5 | 8 | 0 | 4 | 0 |
| BRAND | 0 | 1.00 | 6 | 28 | 0 | 6 | 0 |
| PACKAGE | 0 | 1.00 | 12 | 26 | 0 | 11 | 0 |
| ITEM | 0 | 1.00 | 7 | 29 | 0 | 7 | 0 |
| REGION | 0 | 1.00 | 5 | 11 | 0 | 11 | 0 |
| SEASON | 0 | 1.00 | 4 | 6 | 0 | 4 | 0 |
| PACKAGE2 | 0 | 1.00 | 22 | 34 | 0 | 11 | 0 |
| CALORIC_SEGMENT_TEXT | 94182 | 0.14 | 4 | 4 | 0 | 1 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| DATE | 0 | 1 | 2020-12-05 | 2023-05-20 | 2022-04-09 | 129 |
| min_launch_date | 0 | 1 | 2020-12-05 | 2021-09-04 | 2020-12-05 | 2 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| CALORIC_SEGMENT | 0 | 1 | 0.86 | 0.34 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▁▁▁▁▇ |
| UNIT_SALES | 0 | 1 | 51.24 | 85.35 | 1.00 | 9.00 | 25.00 | 58.00 | 3157.00 | ▇▁▁▁▁ |
| DOLLAR_SALES | 0 | 1 | 171.84 | 327.25 | 0.31 | 27.12 | 74.65 | 184.04 | 12763.23 | ▇▁▁▁▁ |
| POP_SQMI | 0 | 1 | 1723.52 | 1926.89 | 0.18 | 57.10 | 843.08 | 3191.93 | 6769.35 | ▇▂▂▁▁ |
| MONTH | 0 | 1 | 6.21 | 3.61 | 1.00 | 3.00 | 6.00 | 10.00 | 12.00 | ▇▆▃▃▇ |
| ENERGY_DRINK | 0 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
| WEEKS_SINCE_LAUNCH | 0 | 1 | 59.78 | 35.73 | 0.00 | 30.00 | 58.00 | 87.00 | 128.00 | ▇▇▇▆▆ |
# Creating an 'innovation' data frame
#factor all character variables
plum$REGION <- as.factor(plum$REGION)
#plum$CATEGORY <- as.factor(plum$CATEGORY)
plum$BRAND <- as.factor(plum$BRAND)
plum$SEASON <- as.factor(plum$SEASON)
plum$PACKAGE2 <- as.factor(plum$PACKAGE2)
sapply(plum, function(x) sum(is.na(x)))
## MARKET_KEY DATE CALORIC_SEGMENT
## 0 0 0
## CATEGORY UNIT_SALES DOLLAR_SALES
## 0 0 0
## MANUFACTURER BRAND PACKAGE
## 0 0 0
## ITEM POP_SQMI REGION
## 0 0 0
## MONTH SEASON PACKAGE2
## 0 0 0
## ENERGY_DRINK CALORIC_SEGMENT_TEXT min_launch_date
## 0 94182 0
## WEEKS_SINCE_LAUNCH
## 0
model <- lm(DOLLAR_SALES ~ UNIT_SALES + POP_SQMI + REGION + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = plum)
summary(model)
##
## Call:
## lm(formula = DOLLAR_SALES ~ UNIT_SALES + POP_SQMI + REGION +
## MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = plum)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4455.9 -54.8 -0.3 53.3 7116.0
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -8.833e+01 1.002e+01 -8.817
## UNIT_SALES 3.167e+00 5.958e-03 531.591
## POP_SQMI -1.581e-03 2.987e-04 -5.292
## REGIONCALI_NEVADA 1.532e+01 2.986e+00 5.131
## REGIONCOLORADO 3.600e+01 1.649e+00 21.829
## REGIONDESERT_SW 2.789e-01 2.205e+00 0.126
## REGIONKANSAS 1.249e+02 3.948e+00 31.639
## REGIONMOUNTAIN 2.969e+01 1.963e+00 15.126
## REGIONNEWMEXICO 3.618e+01 2.563e+00 14.118
## REGIONNOCAL -6.519e+00 2.362e+00 -2.760
## REGIONNORTHERN 1.621e+01 1.691e+00 9.585
## REGIONPRAIRIE 2.519e+01 3.605e+00 6.987
## REGIONSOCAL -1.047e+01 1.717e+00 -6.102
## MONTH -1.278e-01 1.751e-01 -0.730
## SEASONSPRING 3.390e+00 1.751e+00 1.936
## SEASONSUMMER 6.594e+00 1.630e+00 4.047
## SEASONWINTER 1.239e+01 1.638e+00 7.566
## PACKAGE2CUP 12 LIQUID SMALL X12 NA 1.237e+02 9.747e+00 12.691
## PACKAGE2CUP 12 LIQUID SMALL X18 NA 1.005e+02 1.074e+01 9.355
## PACKAGE2CUP 12 LIQUID SMALL X20 NA 1.141e+02 1.237e+01 9.219
## PACKAGE2CUP 12 LIQUID SMALL X24 NA 3.215e+02 1.019e+01 31.561
## PACKAGE2CUPDY PLUM JUG 24 LIQUID SMALL NA 3.973e+00 1.215e+01 0.327
## PACKAGE2JUG 12 LIQUID SMALL NA -1.917e+01 9.898e+00 -1.937
## PACKAGE2JUG 12 LIQUID SMALL X24 NA 8.781e+01 1.432e+01 6.133
## PACKAGE2JUG 16.9 LIQUID SMALL X6 NA 5.460e+01 1.032e+01 5.290
## PACKAGE2JUG 20 LIQUID SMALL NA -3.349e+01 9.781e+00 -3.424
## PACKAGE2JUG 67.6 LIQUID SMALL NA -5.757e+01 9.780e+00 -5.886
## WEEKS_SINCE_LAUNCH 6.321e-01 1.428e-02 44.271
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## UNIT_SALES < 2e-16 ***
## POP_SQMI 1.21e-07 ***
## REGIONCALI_NEVADA 2.89e-07 ***
## REGIONCOLORADO < 2e-16 ***
## REGIONDESERT_SW 0.899340
## REGIONKANSAS < 2e-16 ***
## REGIONMOUNTAIN < 2e-16 ***
## REGIONNEWMEXICO < 2e-16 ***
## REGIONNOCAL 0.005783 **
## REGIONNORTHERN < 2e-16 ***
## REGIONPRAIRIE 2.82e-12 ***
## REGIONSOCAL 1.05e-09 ***
## MONTH 0.465285
## SEASONSPRING 0.052863 .
## SEASONSUMMER 5.20e-05 ***
## SEASONWINTER 3.87e-14 ***
## PACKAGE2CUP 12 LIQUID SMALL X12 NA < 2e-16 ***
## PACKAGE2CUP 12 LIQUID SMALL X18 NA < 2e-16 ***
## PACKAGE2CUP 12 LIQUID SMALL X20 NA < 2e-16 ***
## PACKAGE2CUP 12 LIQUID SMALL X24 NA < 2e-16 ***
## PACKAGE2CUPDY PLUM JUG 24 LIQUID SMALL NA 0.743719
## PACKAGE2JUG 12 LIQUID SMALL NA 0.052742 .
## PACKAGE2JUG 12 LIQUID SMALL X24 NA 8.65e-10 ***
## PACKAGE2JUG 16.9 LIQUID SMALL X6 NA 1.23e-07 ***
## PACKAGE2JUG 20 LIQUID SMALL NA 0.000616 ***
## PACKAGE2JUG 67.6 LIQUID SMALL NA 3.96e-09 ***
## WEEKS_SINCE_LAUNCH < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 162 on 108858 degrees of freedom
## Multiple R-squared: 0.7551, Adjusted R-squared: 0.755
## F-statistic: 1.243e+04 on 27 and 108858 DF, p-value: < 2.2e-16
# Creating an 'innovation' data frame
model <- lm(UNIT_SALES ~ DOLLAR_SALES + PACKAGE + POP_SQMI + REGION + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = plum)
summary(model)
##
## Call:
## lm(formula = UNIT_SALES ~ DOLLAR_SALES + PACKAGE + POP_SQMI +
## REGION + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH,
## data = plum)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1269.04 -17.40 -2.92 8.59 1866.48
##
## Coefficients: (10 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) 1.401e+01 1.146e+00 12.232
## DOLLAR_SALES 2.279e-01 4.288e-04 531.591
## PACKAGE12SMALL 12ONE CUP -2.518e+00 9.556e-01 -2.635
## PACKAGE12SMALL 18ONE CUP -1.101e+01 1.494e+00 -7.370
## PACKAGE12SMALL 20ONE CUP -1.446e+01 2.240e+00 -6.455
## PACKAGE12SMALL 24ONE CUP -5.018e+01 1.219e+00 -41.158
## PACKAGE12SMALL 24ONE PLASTICS JUG -8.691e+00 2.954e+00 -2.942
## PACKAGE12SMALL 6ONE CUP 8.009e+00 2.769e+00 2.892
## PACKAGE12SMALL MLT PLASTICS JUG 2.241e+01 1.042e+00 21.511
## PACKAGE20SMALL MULTI JUG 3.427e+01 9.721e-01 35.256
## PACKAGE24SMALL MLT SHADYES JUG 1.183e+01 2.183e+00 5.421
## PACKAGE2L MULTI JUG 4.033e+01 9.746e-01 41.386
## POP_SQMI 1.420e-03 8.003e-05 17.740
## REGIONCALI_NEVADA -1.223e+01 8.002e-01 -15.283
## REGIONCOLORADO -1.016e+01 4.423e-01 -22.962
## REGIONDESERT_SW -3.747e+00 5.913e-01 -6.337
## REGIONKANSAS -1.066e+01 1.064e+00 -10.026
## REGIONMOUNTAIN -1.536e+00 5.272e-01 -2.915
## REGIONNEWMEXICO -1.105e+01 6.873e-01 -16.072
## REGIONNOCAL -4.461e+00 6.335e-01 -7.042
## REGIONNORTHERN -6.514e+00 4.535e-01 -14.362
## REGIONPRAIRIE -3.686e-01 9.672e-01 -0.381
## REGIONSOCAL 4.021e+00 4.605e-01 8.732
## MONTH 1.598e-01 4.696e-02 3.403
## SEASONSPRING -4.214e+00 4.696e-01 -8.975
## SEASONSUMMER -5.857e+00 4.368e-01 -13.408
## SEASONWINTER -7.436e+00 4.389e-01 -16.943
## PACKAGE2CUP 12 LIQUID SMALL X12 NA NA NA NA
## PACKAGE2CUP 12 LIQUID SMALL X18 NA NA NA NA
## PACKAGE2CUP 12 LIQUID SMALL X20 NA NA NA NA
## PACKAGE2CUP 12 LIQUID SMALL X24 NA NA NA NA
## PACKAGE2CUPDY PLUM JUG 24 LIQUID SMALL NA NA NA NA
## PACKAGE2JUG 12 LIQUID SMALL NA NA NA NA
## PACKAGE2JUG 12 LIQUID SMALL X24 NA NA NA NA
## PACKAGE2JUG 16.9 LIQUID SMALL X6 NA NA NA NA
## PACKAGE2JUG 20 LIQUID SMALL NA NA NA NA
## PACKAGE2JUG 67.6 LIQUID SMALL NA NA NA NA
## WEEKS_SINCE_LAUNCH -2.029e-01 3.816e-03 -53.184
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## DOLLAR_SALES < 2e-16 ***
## PACKAGE12SMALL 12ONE CUP 0.008408 **
## PACKAGE12SMALL 18ONE CUP 1.72e-13 ***
## PACKAGE12SMALL 20ONE CUP 1.08e-10 ***
## PACKAGE12SMALL 24ONE CUP < 2e-16 ***
## PACKAGE12SMALL 24ONE PLASTICS JUG 0.003259 **
## PACKAGE12SMALL 6ONE CUP 0.003827 **
## PACKAGE12SMALL MLT PLASTICS JUG < 2e-16 ***
## PACKAGE20SMALL MULTI JUG < 2e-16 ***
## PACKAGE24SMALL MLT SHADYES JUG 5.93e-08 ***
## PACKAGE2L MULTI JUG < 2e-16 ***
## POP_SQMI < 2e-16 ***
## REGIONCALI_NEVADA < 2e-16 ***
## REGIONCOLORADO < 2e-16 ***
## REGIONDESERT_SW 2.35e-10 ***
## REGIONKANSAS < 2e-16 ***
## REGIONMOUNTAIN 0.003562 **
## REGIONNEWMEXICO < 2e-16 ***
## REGIONNOCAL 1.91e-12 ***
## REGIONNORTHERN < 2e-16 ***
## REGIONPRAIRIE 0.703160
## REGIONSOCAL < 2e-16 ***
## MONTH 0.000666 ***
## SEASONSPRING < 2e-16 ***
## SEASONSUMMER < 2e-16 ***
## SEASONWINTER < 2e-16 ***
## PACKAGE2CUP 12 LIQUID SMALL X12 NA NA
## PACKAGE2CUP 12 LIQUID SMALL X18 NA NA
## PACKAGE2CUP 12 LIQUID SMALL X20 NA NA
## PACKAGE2CUP 12 LIQUID SMALL X24 NA NA
## PACKAGE2CUPDY PLUM JUG 24 LIQUID SMALL NA NA
## PACKAGE2JUG 12 LIQUID SMALL NA NA
## PACKAGE2JUG 12 LIQUID SMALL X24 NA NA
## PACKAGE2JUG 16.9 LIQUID SMALL X6 NA NA
## PACKAGE2JUG 20 LIQUID SMALL NA NA
## PACKAGE2JUG 67.6 LIQUID SMALL NA NA
## WEEKS_SINCE_LAUNCH < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43.45 on 108858 degrees of freedom
## Multiple R-squared: 0.7409, Adjusted R-squared: 0.7408
## F-statistic: 1.153e+04 on 27 and 108858 DF, p-value: < 2.2e-16
# Creating an 'innovation' data frame
model <- lm(UNIT_SALES ~ CALORIC_SEGMENT + PACKAGE + POP_SQMI + REGION + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = plum)
summary(model)
##
## Call:
## lm(formula = UNIT_SALES ~ CALORIC_SEGMENT + PACKAGE + POP_SQMI +
## REGION + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH,
## data = plum)
##
## Residuals:
## Min 1Q Median 3Q Max
## -139.95 -39.95 -17.33 12.65 3062.52
##
## Coefficients: (10 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) -1.055e+01 2.289e+00 -4.608
## CALORIC_SEGMENT 3.402e+01 7.918e-01 42.963
## PACKAGE12SMALL 12ONE CUP 5.193e+01 1.791e+00 28.994
## PACKAGE12SMALL 18ONE CUP -1.486e+00 2.810e+00 -0.529
## PACKAGE12SMALL 20ONE CUP -2.962e+00 4.212e+00 -0.703
## PACKAGE12SMALL 24ONE CUP 3.805e+01 2.271e+00 16.752
## PACKAGE12SMALL 24ONE PLASTICS JUG -4.557e+00 5.554e+00 -0.820
## PACKAGE12SMALL 6ONE CUP -1.798e+01 5.207e+00 -3.454
## PACKAGE12SMALL MLT PLASTICS JUG 1.954e+01 1.959e+00 9.976
## PACKAGE20SMALL MULTI JUG 5.010e+01 1.827e+00 27.421
## PACKAGE24SMALL MLT SHADYES JUG 5.998e-01 4.104e+00 0.146
## PACKAGE2L MULTI JUG 6.015e+01 1.839e+00 32.703
## POP_SQMI 3.627e-03 1.503e-04 24.130
## REGIONCALI_NEVADA -3.425e+01 1.505e+00 -22.762
## REGIONCOLORADO -8.409e+00 8.324e-01 -10.103
## REGIONDESERT_SW -1.531e+01 1.112e+00 -13.759
## REGIONKANSAS 6.144e+01 1.983e+00 30.978
## REGIONMOUNTAIN 1.591e+01 9.910e-01 16.050
## REGIONNEWMEXICO -1.301e+01 1.294e+00 -10.052
## REGIONNOCAL -2.429e+01 1.192e+00 -20.382
## REGIONNORTHERN -3.342e+00 8.672e-01 -3.853
## REGIONPRAIRIE 1.636e+01 1.819e+00 8.998
## REGIONSOCAL 3.244e+00 8.680e-01 3.737
## MONTH 4.880e-01 8.830e-02 5.527
## SEASONSPRING -1.274e+01 8.826e-01 -14.436
## SEASONSUMMER -1.448e+01 8.211e-01 -17.636
## SEASONWINTER -1.657e+01 8.247e-01 -20.091
## PACKAGE2CUP 12 LIQUID SMALL X12 NA NA NA NA
## PACKAGE2CUP 12 LIQUID SMALL X18 NA NA NA NA
## PACKAGE2CUP 12 LIQUID SMALL X20 NA NA NA NA
## PACKAGE2CUP 12 LIQUID SMALL X24 NA NA NA NA
## PACKAGE2CUPDY PLUM JUG 24 LIQUID SMALL NA NA NA NA
## PACKAGE2JUG 12 LIQUID SMALL NA NA NA NA
## PACKAGE2JUG 12 LIQUID SMALL X24 NA NA NA NA
## PACKAGE2JUG 16.9 LIQUID SMALL X6 NA NA NA NA
## PACKAGE2JUG 20 LIQUID SMALL NA NA NA NA
## PACKAGE2JUG 67.6 LIQUID SMALL NA NA NA NA
## WEEKS_SINCE_LAUNCH -1.885e-01 7.195e-03 -26.200
## Pr(>|t|)
## (Intercept) 4.06e-06 ***
## CALORIC_SEGMENT < 2e-16 ***
## PACKAGE12SMALL 12ONE CUP < 2e-16 ***
## PACKAGE12SMALL 18ONE CUP 0.596812
## PACKAGE12SMALL 20ONE CUP 0.481934
## PACKAGE12SMALL 24ONE CUP < 2e-16 ***
## PACKAGE12SMALL 24ONE PLASTICS JUG 0.411970
## PACKAGE12SMALL 6ONE CUP 0.000553 ***
## PACKAGE12SMALL MLT PLASTICS JUG < 2e-16 ***
## PACKAGE20SMALL MULTI JUG < 2e-16 ***
## PACKAGE24SMALL MLT SHADYES JUG 0.883803
## PACKAGE2L MULTI JUG < 2e-16 ***
## POP_SQMI < 2e-16 ***
## REGIONCALI_NEVADA < 2e-16 ***
## REGIONCOLORADO < 2e-16 ***
## REGIONDESERT_SW < 2e-16 ***
## REGIONKANSAS < 2e-16 ***
## REGIONMOUNTAIN < 2e-16 ***
## REGIONNEWMEXICO < 2e-16 ***
## REGIONNOCAL < 2e-16 ***
## REGIONNORTHERN 0.000117 ***
## REGIONPRAIRIE < 2e-16 ***
## REGIONSOCAL 0.000186 ***
## MONTH 3.27e-08 ***
## SEASONSPRING < 2e-16 ***
## SEASONSUMMER < 2e-16 ***
## SEASONWINTER < 2e-16 ***
## PACKAGE2CUP 12 LIQUID SMALL X12 NA NA
## PACKAGE2CUP 12 LIQUID SMALL X18 NA NA
## PACKAGE2CUP 12 LIQUID SMALL X20 NA NA
## PACKAGE2CUP 12 LIQUID SMALL X24 NA NA
## PACKAGE2CUPDY PLUM JUG 24 LIQUID SMALL NA NA
## PACKAGE2JUG 12 LIQUID SMALL NA NA
## PACKAGE2JUG 12 LIQUID SMALL X24 NA NA
## PACKAGE2JUG 16.9 LIQUID SMALL X6 NA NA
## PACKAGE2JUG 20 LIQUID SMALL NA NA
## PACKAGE2JUG 67.6 LIQUID SMALL NA NA
## WEEKS_SINCE_LAUNCH < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 81.71 on 108858 degrees of freedom
## Multiple R-squared: 0.08372, Adjusted R-squared: 0.08349
## F-statistic: 368.4 on 27 and 108858 DF, p-value: < 2.2e-16
#weekly sales where ITEM contains "RAINING THRASHED PLUM"
plum %>%
filter(str_detect(ITEM, "RAINING THRASHED PLUM")) %>%
group_by(DATE) %>%
summarise(UNIT_SALES = sum(UNIT_SALES)) %>%
ggplot(aes(x = DATE, y = UNIT_SALES)) +
geom_line() +
labs(title = "Weekly Sales of 'RAINING THRASHED PLUM'", x = "Week of Year", y = "Unit Sales")
# Load and prepare dataset
df <- read.csv("plum_tableau.csv")
# Load and prepare dataset
str(df)
## 'data.frame': 108886 obs. of 19 variables:
## $ MARKET_KEY : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DATE : chr "2022-06-18" "2022-04-30" "2021-12-11" "2022-07-30" ...
## $ CALORIC_SEGMENT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ CATEGORY : chr "SSD" "SSD" "SSD" "SSD" ...
## $ UNIT_SALES : int 1 14 18 13 19 4 29 35 75 25 ...
## $ DOLLAR_SALES : num 4.62 86.86 89.73 65.6 72.93 ...
## $ MANUFACTURER : chr "JOLLYS" "JOLLYS" "JOLLYS" "JOLLYS" ...
## $ BRAND : chr "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" ...
## $ PACKAGE : chr "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" ...
## $ ITEM : chr "RAINING THRASHED PLUM" "RAINING THRASHED PLUM" "RAINING THRASHED PLUM" "RAINING THRASHED PLUM" ...
## $ POP_SQMI : num 1.2 1.2 1.2 1.2 1.2 ...
## $ REGION : chr "NORTHERN" "NORTHERN" "NORTHERN" "NORTHERN" ...
## $ MONTH : int 6 4 12 7 11 6 1 11 10 12 ...
## $ SEASON : chr "SUMMER" "SPRING" "WINTER" "SUMMER" ...
## $ PACKAGE2 : chr "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" ...
## $ ENERGY_DRINK : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CALORIC_SEGMENT_TEXT: chr NA NA NA NA ...
## $ min_launch_date : chr "2021-09-04" "2021-09-04" "2021-09-04" "2021-09-04" ...
## $ WEEKS_SINCE_LAUNCH : int 41 34 14 47 12 40 20 9 6 13 ...
plum <- df %>%
#select(-DATE, -MONTH, -SEASON, -BRAND, -REGION, -ITEM )
select(-MONTH, -SEASON, -min_launch_date, -PACKAGE2, -CALORIC_SEGMENT_TEXT)
print(unique(plum$ITEM))
## [1] "RAINING THRASHED PLUM" "BEAUTIFUL GREENER PLUM"
## [3] " SMASH SUNSET" "FANTASMIC PLUM"
## [5] "ZIZZLES PLUM" "GO-DAY "
## [7] "FANTASMIC CUSTARD APPLE PLUM"
print(unique(plum$BRAND))
## [1] "HILL MOISTURE THRASHED APPLE" "BEAUTIFUL GREENER"
## [3] "DIET SMASH" "FANTASMIC"
## [5] "SINGLE GROUP" "GO-DAY"
print(unique(plum$CATEGORY))
## [1] "SSD"
print(unique(plum$PACKAGE))
## [1] "12SMALL 12ONE CUP" "20SMALL MULTI JUG"
## [3] "2L MULTI JUG" "12SMALL 6ONE CUP"
## [5] "12SMALL 24ONE CUP" "12SMALL MLT PLASTICS JUG"
## [7] "12SMALL 20ONE CUP" ".5L 6ONE JUG"
## [9] "12SMALL 18ONE CUP" "12SMALL 24ONE PLASTICS JUG"
## [11] "24SMALL MLT SHADYES JUG"
# Assuming plum is your data frame and PACKAGING is the column of interest
# Create new columns in woodsy for each unique substring
# Each column will have a 1 if the substring is found in the PACKAGING column, 0 otherwise
plum$`12SMALL 12ONE CUP` = as.integer(grepl("12SMALL 12ONE CUP", plum$PACKAGE))
plum$`20SMALL MULTI JUG` = as.integer(grepl("20SMALL MULTI JUG", plum$PACKAGE))
plum$`12SMALL 6ONE CUP` = as.integer(grepl("12SMALL 6ONE CUP", plum$PACKAGE))
plum$`12SMALL 24ONE CUP` = as.integer(grepl("12SMALL 24ONE CUP", plum$PACKAGE))
plum$`12SMALL MLT PLASTICS JUG` = as.integer(grepl("12SMALL MLT PLASTICS JUG", plum$PACKAGE))
plum$`.5L 6ONE JUG` = as.integer(grepl(".5L 6ONE JUG", plum$PACKAGE))
plum$`12SMALL 20ONE CUP` = as.integer(grepl("12SMALL 20ONE CUP", plum$PACKAGE))
plum$`12SMALL 18ONE CUP` = as.integer(grepl("12SMALL 18ONE CUP", plum$PACKAGE))
plum$`12SMALL 24ONE PLASTICS JUG` = as.integer(grepl("12SMALL 24ONE PLASTICS JUG", plum$PACKAGE))
#one hot encode non brand ITEM strings
#"RAINING THRASHED PLUM" "BEAUTIFUL GREENER PLUM" "ZIZZLES PLUM"
plum$`RAINING THRASHED PLUM` = as.integer(grepl("RAINING THRASHED PLUM", plum$ITEM))
plum$`BEAUTIFUL GREENER PLUM` = as.integer(grepl("BEAUTIFUL GREENER PLUM", plum$ITEM))
plum$`ZIZZLES PLUM` = as.integer(grepl("ZIZZLES PLUM", plum$ITEM))
# Print the head of the data frame to see the first few rows
head(plum)
## MARKET_KEY DATE CALORIC_SEGMENT CATEGORY UNIT_SALES DOLLAR_SALES
## 1 1 2022-06-18 1 SSD 1 4.62
## 2 1 2022-04-30 1 SSD 14 86.86
## 3 1 2021-12-11 1 SSD 18 89.73
## 4 1 2022-07-30 1 SSD 13 65.60
## 5 1 2021-11-27 1 SSD 19 72.93
## 6 1 2022-06-11 1 SSD 4 25.60
## MANUFACTURER BRAND PACKAGE
## 1 JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 2 JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 3 JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 4 JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 5 JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## 6 JOLLYS HILL MOISTURE THRASHED APPLE 12SMALL 12ONE CUP
## ITEM POP_SQMI REGION ENERGY_DRINK WEEKS_SINCE_LAUNCH
## 1 RAINING THRASHED PLUM 1.201114 NORTHERN 0 41
## 2 RAINING THRASHED PLUM 1.201114 NORTHERN 0 34
## 3 RAINING THRASHED PLUM 1.201114 NORTHERN 0 14
## 4 RAINING THRASHED PLUM 1.201114 NORTHERN 0 47
## 5 RAINING THRASHED PLUM 1.201114 NORTHERN 0 12
## 6 RAINING THRASHED PLUM 1.201114 NORTHERN 0 40
## 12SMALL 12ONE CUP 20SMALL MULTI JUG 12SMALL 6ONE CUP 12SMALL 24ONE CUP
## 1 1 0 0 0
## 2 1 0 0 0
## 3 1 0 0 0
## 4 1 0 0 0
## 5 1 0 0 0
## 6 1 0 0 0
## 12SMALL MLT PLASTICS JUG .5L 6ONE JUG 12SMALL 20ONE CUP 12SMALL 18ONE CUP
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## 12SMALL 24ONE PLASTICS JUG RAINING THRASHED PLUM BEAUTIFUL GREENER PLUM
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## ZIZZLES PLUM
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
plum$CATEGORY <- NULL
plum$MARKET_KEY <- NULL
plum$MANUFACTURER <- NULL
plum$PACKAGE <- NULL
library(fastDummies)
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
# One-hot encode the specified columns
plum <- fastDummies::dummy_cols(plum, select_columns = c("REGION", "ITEM"), remove_selected_columns = TRUE)
#plum <- fastDummies::dummy_cols(plum, select_columns = c("REGION", "SEASON","ITEM"), remove_selected_columns = TRUE)
# View the first few rows to verify the changes
head(plum)
## DATE CALORIC_SEGMENT UNIT_SALES DOLLAR_SALES
## 1 2022-06-18 1 1 4.62
## 2 2022-04-30 1 14 86.86
## 3 2021-12-11 1 18 89.73
## 4 2022-07-30 1 13 65.60
## 5 2021-11-27 1 19 72.93
## 6 2022-06-11 1 4 25.60
## BRAND POP_SQMI ENERGY_DRINK WEEKS_SINCE_LAUNCH
## 1 HILL MOISTURE THRASHED APPLE 1.201114 0 41
## 2 HILL MOISTURE THRASHED APPLE 1.201114 0 34
## 3 HILL MOISTURE THRASHED APPLE 1.201114 0 14
## 4 HILL MOISTURE THRASHED APPLE 1.201114 0 47
## 5 HILL MOISTURE THRASHED APPLE 1.201114 0 12
## 6 HILL MOISTURE THRASHED APPLE 1.201114 0 40
## 12SMALL 12ONE CUP 20SMALL MULTI JUG 12SMALL 6ONE CUP 12SMALL 24ONE CUP
## 1 1 0 0 0
## 2 1 0 0 0
## 3 1 0 0 0
## 4 1 0 0 0
## 5 1 0 0 0
## 6 1 0 0 0
## 12SMALL MLT PLASTICS JUG .5L 6ONE JUG 12SMALL 20ONE CUP 12SMALL 18ONE CUP
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## 12SMALL 24ONE PLASTICS JUG RAINING THRASHED PLUM BEAUTIFUL GREENER PLUM
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## ZIZZLES PLUM REGION_ARIZONA REGION_CALI_NEVADA REGION_COLORADO
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## REGION_DESERT_SW REGION_KANSAS REGION_MOUNTAIN REGION_NEWMEXICO REGION_NOCAL
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## REGION_NORTHERN REGION_PRAIRIE REGION_SOCAL ITEM_ SMASH SUNSET
## 1 1 0 0 0
## 2 1 0 0 0
## 3 1 0 0 0
## 4 1 0 0 0
## 5 1 0 0 0
## 6 1 0 0 0
## ITEM_BEAUTIFUL GREENER PLUM ITEM_FANTASMIC CUSTARD APPLE PLUM
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## ITEM_FANTASMIC PLUM ITEM_GO-DAY ITEM_RAINING THRASHED PLUM
## 1 0 0 1
## 2 0 0 1
## 3 0 0 1
## 4 0 0 1
## 5 0 0 1
## 6 0 0 1
## ITEM_ZIZZLES PLUM
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
write.csv(plum, "plum_one_hot.csv", row.names = FALSE)
library(fastDummies)
# One-hot encode
plum <- fastDummies::dummy_cols(plum, select_columns = "BRAND", remove_selected_columns = TRUE)
# View the first few rows to verify
head(plum)
## DATE CALORIC_SEGMENT UNIT_SALES DOLLAR_SALES POP_SQMI ENERGY_DRINK
## 1 2022-06-18 1 1 4.62 1.201114 0
## 2 2022-04-30 1 14 86.86 1.201114 0
## 3 2021-12-11 1 18 89.73 1.201114 0
## 4 2022-07-30 1 13 65.60 1.201114 0
## 5 2021-11-27 1 19 72.93 1.201114 0
## 6 2022-06-11 1 4 25.60 1.201114 0
## WEEKS_SINCE_LAUNCH 12SMALL 12ONE CUP 20SMALL MULTI JUG 12SMALL 6ONE CUP
## 1 41 1 0 0
## 2 34 1 0 0
## 3 14 1 0 0
## 4 47 1 0 0
## 5 12 1 0 0
## 6 40 1 0 0
## 12SMALL 24ONE CUP 12SMALL MLT PLASTICS JUG .5L 6ONE JUG 12SMALL 20ONE CUP
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## 12SMALL 18ONE CUP 12SMALL 24ONE PLASTICS JUG RAINING THRASHED PLUM
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## BEAUTIFUL GREENER PLUM ZIZZLES PLUM REGION_ARIZONA REGION_CALI_NEVADA
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## REGION_COLORADO REGION_DESERT_SW REGION_KANSAS REGION_MOUNTAIN
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## REGION_NEWMEXICO REGION_NOCAL REGION_NORTHERN REGION_PRAIRIE REGION_SOCAL
## 1 0 0 1 0 0
## 2 0 0 1 0 0
## 3 0 0 1 0 0
## 4 0 0 1 0 0
## 5 0 0 1 0 0
## 6 0 0 1 0 0
## ITEM_ SMASH SUNSET ITEM_BEAUTIFUL GREENER PLUM
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## ITEM_FANTASMIC CUSTARD APPLE PLUM ITEM_FANTASMIC PLUM ITEM_GO-DAY
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## ITEM_RAINING THRASHED PLUM ITEM_ZIZZLES PLUM BRAND_BEAUTIFUL GREENER
## 1 1 0 0
## 2 1 0 0
## 3 1 0 0
## 4 1 0 0
## 5 1 0 0
## 6 1 0 0
## BRAND_DIET SMASH BRAND_FANTASMIC BRAND_GO-DAY
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## BRAND_HILL MOISTURE THRASHED APPLE BRAND_SINGLE GROUP
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
#create new week of year column
plum <- plum %>%
mutate(DATE = as.Date(DATE)) %>%
mutate(WEEK_OF_YEAR = lubridate::week(DATE))
#Drop DATE column
plum$DATE <- NULL
# Summarize the dataset
skimr::skim(plum)
| Name | plum |
| Number of rows | 108886 |
| Number of columns | 43 |
| _______________________ | |
| Column type frequency: | |
| numeric | 43 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| CALORIC_SEGMENT | 0 | 1 | 0.86 | 0.34 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▁▁▁▁▇ |
| UNIT_SALES | 0 | 1 | 51.24 | 85.35 | 1.00 | 9.00 | 25.00 | 58.00 | 3157.00 | ▇▁▁▁▁ |
| DOLLAR_SALES | 0 | 1 | 171.84 | 327.25 | 0.31 | 27.12 | 74.65 | 184.04 | 12763.23 | ▇▁▁▁▁ |
| POP_SQMI | 0 | 1 | 1723.52 | 1926.89 | 0.18 | 57.10 | 843.08 | 3191.93 | 6769.35 | ▇▂▂▁▁ |
| ENERGY_DRINK | 0 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
| WEEKS_SINCE_LAUNCH | 0 | 1 | 59.78 | 35.73 | 0.00 | 30.00 | 58.00 | 87.00 | 128.00 | ▇▇▇▆▆ |
| 12SMALL 12ONE CUP | 0 | 1 | 0.44 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▆ |
| 20SMALL MULTI JUG | 0 | 1 | 0.21 | 0.41 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| 12SMALL 6ONE CUP | 0 | 1 | 0.00 | 0.05 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| 12SMALL 24ONE CUP | 0 | 1 | 0.03 | 0.16 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| 12SMALL MLT PLASTICS JUG | 0 | 1 | 0.07 | 0.25 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| .5L 6ONE JUG | 0 | 1 | 0.02 | 0.14 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| 12SMALL 20ONE CUP | 0 | 1 | 0.00 | 0.06 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| 12SMALL 18ONE CUP | 0 | 1 | 0.01 | 0.11 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| 12SMALL 24ONE PLASTICS JUG | 0 | 1 | 0.00 | 0.05 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| RAINING THRASHED PLUM | 0 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
| BEAUTIFUL GREENER PLUM | 0 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
| ZIZZLES PLUM | 0 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
| REGION_ARIZONA | 0 | 1 | 0.28 | 0.45 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
| REGION_CALI_NEVADA | 0 | 1 | 0.03 | 0.17 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_COLORADO | 0 | 1 | 0.13 | 0.34 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_DESERT_SW | 0 | 1 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_KANSAS | 0 | 1 | 0.02 | 0.13 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_MOUNTAIN | 0 | 1 | 0.09 | 0.28 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_NEWMEXICO | 0 | 1 | 0.04 | 0.20 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_NOCAL | 0 | 1 | 0.06 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_NORTHERN | 0 | 1 | 0.13 | 0.34 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_PRAIRIE | 0 | 1 | 0.02 | 0.14 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_SOCAL | 0 | 1 | 0.14 | 0.34 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| ITEM_ SMASH SUNSET | 0 | 1 | 0.14 | 0.34 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| ITEM_BEAUTIFUL GREENER PLUM | 0 | 1 | 0.59 | 0.49 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▆▁▁▁▇ |
| ITEM_FANTASMIC CUSTARD APPLE PLUM | 0 | 1 | 0.00 | 0.01 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| ITEM_FANTASMIC PLUM | 0 | 1 | 0.00 | 0.02 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| ITEM_GO-DAY | 0 | 1 | 0.00 | 0.07 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| ITEM_RAINING THRASHED PLUM | 0 | 1 | 0.24 | 0.43 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| ITEM_ZIZZLES PLUM | 0 | 1 | 0.03 | 0.17 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| BRAND_BEAUTIFUL GREENER | 0 | 1 | 0.59 | 0.49 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▆▁▁▁▇ |
| BRAND_DIET SMASH | 0 | 1 | 0.14 | 0.34 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| BRAND_FANTASMIC | 0 | 1 | 0.00 | 0.02 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| BRAND_GO-DAY | 0 | 1 | 0.00 | 0.07 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| BRAND_HILL MOISTURE THRASHED APPLE | 0 | 1 | 0.24 | 0.43 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| BRAND_SINGLE GROUP | 0 | 1 | 0.03 | 0.17 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| WEEK_OF_YEAR | 0 | 1 | 25.40 | 15.86 | 1.00 | 11.00 | 23.00 | 40.00 | 53.00 | ▇▇▅▅▆ |
#remove top 10 percent of unit sales to clean up outliers
df <- plum %>%
filter(UNIT_SALES < quantile(UNIT_SALES, 0.99))
# Split the data
set.seed(123)
df_testtrn <- initial_split(df, prop = 0.8, strata = UNIT_SALES)
Train <- training(df_testtrn)
Test <- testing(df_testtrn)
# Prepare features and labels for XGBoost
train_features <- Train[, -which(names(Train) == "UNIT_SALES")]
train_labels <- Train$UNIT_SALES
test_features <- Test[, -which(names(Test) == "UNIT_SALES")]
test_labels <- Test$UNIT_SALES
# Convert data to DMatrix format
dtrain <- xgb.DMatrix(data = as.matrix(train_features), label = train_labels)
dtest <- xgb.DMatrix(data = as.matrix(test_features), label = test_labels)
# Define XGBoost parameters
set.seed(123)
params <- list(
booster = "gbtree",
objective = "reg:squarederror",
eval_metric = "rmse",
eta = 0.05,
max_depth = 4,
min_child_weight = 3,
subsample = 0.7,
colsample_bytree = 0.6,
lambda = 1,
alpha = 1
)
# Perform cross-validation to find the optimal number of boosting rounds
cv_results <- xgb.cv(
params = params,
data = dtrain,
nfold = 5,
nrounds = 500, # Changed from 'num_boost_round' to 'nrounds'
early_stopping_rounds = 10,
metrics = "rmse",
seed = 123
)
## [1] train-rmse:72.137771+0.694189 test-rmse:72.122812+1.667395
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
##
## [2] train-rmse:69.255785+1.419294 test-rmse:69.241268+2.195574
## [3] train-rmse:66.960070+1.396622 test-rmse:66.952155+1.992800
## [4] train-rmse:65.320298+1.809715 test-rmse:65.317026+2.342574
## [5] train-rmse:63.264055+1.898804 test-rmse:63.253139+2.160364
## [6] train-rmse:60.868240+1.911852 test-rmse:60.873311+2.366105
## [7] train-rmse:58.958888+2.123034 test-rmse:58.995136+2.756431
## [8] train-rmse:57.365471+1.877309 test-rmse:57.416698+2.755472
## [9] train-rmse:55.350107+1.948074 test-rmse:55.387263+2.507748
## [10] train-rmse:53.003066+1.855188 test-rmse:53.046293+2.402330
## [11] train-rmse:51.623821+2.252422 test-rmse:51.666774+2.642721
## [12] train-rmse:50.048931+2.282919 test-rmse:50.086026+2.464909
## [13] train-rmse:48.162787+2.171677 test-rmse:48.206389+2.344030
## [14] train-rmse:47.255850+2.153060 test-rmse:47.301651+2.274130
## [15] train-rmse:46.203406+2.591040 test-rmse:46.258844+2.674439
## [16] train-rmse:44.504130+2.471292 test-rmse:44.556336+2.514257
## [17] train-rmse:43.386682+2.152247 test-rmse:43.442540+2.358840
## [18] train-rmse:41.733663+1.967301 test-rmse:41.793139+2.187208
## [19] train-rmse:40.998679+1.839151 test-rmse:41.054409+1.981375
## [20] train-rmse:39.682598+1.425809 test-rmse:39.735369+1.586714
## [21] train-rmse:38.904505+1.402720 test-rmse:38.943172+1.380490
## [22] train-rmse:37.740772+1.110022 test-rmse:37.778690+1.041240
## [23] train-rmse:36.940487+1.393524 test-rmse:36.980743+1.226831
## [24] train-rmse:35.712688+1.344147 test-rmse:35.753979+1.143263
## [25] train-rmse:35.202848+1.631906 test-rmse:35.250355+1.496182
## [26] train-rmse:34.344661+1.829583 test-rmse:34.389204+1.523998
## [27] train-rmse:33.331308+1.708773 test-rmse:33.384636+1.537119
## [28] train-rmse:32.601322+1.741031 test-rmse:32.647010+1.370052
## [29] train-rmse:32.030753+1.715564 test-rmse:32.089619+1.563183
## [30] train-rmse:31.371337+1.619875 test-rmse:31.442856+1.634927
## [31] train-rmse:30.454055+1.614446 test-rmse:30.531082+1.716651
## [32] train-rmse:29.951549+1.911215 test-rmse:30.031847+1.955978
## [33] train-rmse:29.125629+1.925990 test-rmse:29.216651+2.001208
## [34] train-rmse:28.362703+1.960172 test-rmse:28.461554+2.093729
## [35] train-rmse:28.028859+1.936583 test-rmse:28.125461+2.013657
## [36] train-rmse:27.417881+1.660393 test-rmse:27.518401+1.733058
## [37] train-rmse:26.985197+1.717643 test-rmse:27.084313+1.692132
## [38] train-rmse:26.578732+1.430717 test-rmse:26.687021+1.534495
## [39] train-rmse:26.251098+1.340034 test-rmse:26.357109+1.376575
## [40] train-rmse:25.806954+1.122894 test-rmse:25.911649+1.199642
## [41] train-rmse:25.244082+1.252438 test-rmse:25.356067+1.334437
## [42] train-rmse:24.745634+1.148779 test-rmse:24.864694+1.324762
## [43] train-rmse:24.299181+1.125123 test-rmse:24.422515+1.354251
## [44] train-rmse:23.964521+1.071489 test-rmse:24.098154+1.377127
## [45] train-rmse:23.517820+1.078028 test-rmse:23.650329+1.293446
## [46] train-rmse:23.303199+1.027667 test-rmse:23.445906+1.369477
## [47] train-rmse:22.994039+0.890609 test-rmse:23.129871+1.169562
## [48] train-rmse:22.557341+0.930225 test-rmse:22.699237+1.178580
## [49] train-rmse:22.253992+0.928062 test-rmse:22.395832+1.235226
## [50] train-rmse:21.773938+0.884540 test-rmse:21.916110+1.078190
## [51] train-rmse:21.359928+0.920151 test-rmse:21.506013+1.155020
## [52] train-rmse:21.246803+0.997746 test-rmse:21.392823+1.223455
## [53] train-rmse:21.111708+1.119717 test-rmse:21.259855+1.337181
## [54] train-rmse:20.836652+1.117059 test-rmse:20.985814+1.336396
## [55] train-rmse:20.703284+1.177471 test-rmse:20.849252+1.364993
## [56] train-rmse:20.464724+1.286973 test-rmse:20.614273+1.481153
## [57] train-rmse:20.041332+1.151340 test-rmse:20.206083+1.364809
## [58] train-rmse:19.749614+1.126183 test-rmse:19.915663+1.395039
## [59] train-rmse:19.496684+1.131648 test-rmse:19.659368+1.366849
## [60] train-rmse:19.371638+1.185437 test-rmse:19.539404+1.448636
## [61] train-rmse:18.959551+1.122090 test-rmse:19.128390+1.350801
## [62] train-rmse:18.876290+1.178157 test-rmse:19.046207+1.405915
## [63] train-rmse:18.564062+1.150144 test-rmse:18.735915+1.376422
## [64] train-rmse:18.266796+1.094644 test-rmse:18.442844+1.336064
## [65] train-rmse:17.963045+0.979500 test-rmse:18.140928+1.253338
## [66] train-rmse:17.715753+0.957250 test-rmse:17.892912+1.184003
## [67] train-rmse:17.499816+0.853499 test-rmse:17.680738+1.120640
## [68] train-rmse:17.358329+0.867539 test-rmse:17.541940+1.152322
## [69] train-rmse:17.198595+0.870387 test-rmse:17.386982+1.163704
## [70] train-rmse:17.052551+0.860202 test-rmse:17.242459+1.180460
## [71] train-rmse:16.947208+0.861727 test-rmse:17.137946+1.191236
## [72] train-rmse:16.875120+0.848341 test-rmse:17.065087+1.175685
## [73] train-rmse:16.718737+0.812504 test-rmse:16.917857+1.150183
## [74] train-rmse:16.555205+0.839413 test-rmse:16.753342+1.196134
## [75] train-rmse:16.449067+0.817016 test-rmse:16.645079+1.176282
## [76] train-rmse:16.319150+0.821313 test-rmse:16.519383+1.168047
## [77] train-rmse:16.171130+0.846390 test-rmse:16.376551+1.211002
## [78] train-rmse:16.035068+0.830742 test-rmse:16.248189+1.204612
## [79] train-rmse:15.892681+0.867631 test-rmse:16.110772+1.234937
## [80] train-rmse:15.708920+0.867126 test-rmse:15.934721+1.228622
## [81] train-rmse:15.498828+0.804373 test-rmse:15.727228+1.168941
## [82] train-rmse:15.308794+0.783506 test-rmse:15.533496+1.149043
## [83] train-rmse:15.219015+0.770153 test-rmse:15.441557+1.134210
## [84] train-rmse:15.027482+0.728806 test-rmse:15.251472+1.089898
## [85] train-rmse:14.888443+0.701976 test-rmse:15.115715+1.047146
## [86] train-rmse:14.773896+0.639976 test-rmse:14.999449+0.980189
## [87] train-rmse:14.618083+0.645314 test-rmse:14.848241+0.983271
## [88] train-rmse:14.527579+0.654372 test-rmse:14.759424+0.975085
## [89] train-rmse:14.408054+0.660338 test-rmse:14.640384+0.968095
## [90] train-rmse:14.294937+0.658424 test-rmse:14.523074+0.945605
## [91] train-rmse:14.153347+0.594116 test-rmse:14.381315+0.878431
## [92] train-rmse:14.045909+0.559802 test-rmse:14.278560+0.824898
## [93] train-rmse:13.921658+0.546825 test-rmse:14.158136+0.816067
## [94] train-rmse:13.837715+0.541879 test-rmse:14.075464+0.817687
## [95] train-rmse:13.784775+0.533811 test-rmse:14.021970+0.811563
## [96] train-rmse:13.733846+0.546830 test-rmse:13.970142+0.813271
## [97] train-rmse:13.651351+0.545428 test-rmse:13.894688+0.801871
## [98] train-rmse:13.560599+0.520556 test-rmse:13.802283+0.775158
## [99] train-rmse:13.426946+0.475212 test-rmse:13.672718+0.726234
## [100] train-rmse:13.308250+0.463842 test-rmse:13.549346+0.741427
## [101] train-rmse:13.215402+0.461362 test-rmse:13.456300+0.750187
## [102] train-rmse:13.130064+0.439550 test-rmse:13.371356+0.715389
## [103] train-rmse:13.072106+0.446331 test-rmse:13.317727+0.734271
## [104] train-rmse:13.018722+0.441255 test-rmse:13.268797+0.731916
## [105] train-rmse:12.942358+0.428125 test-rmse:13.192922+0.733786
## [106] train-rmse:12.891616+0.418396 test-rmse:13.143847+0.732341
## [107] train-rmse:12.837582+0.433064 test-rmse:13.089560+0.751399
## [108] train-rmse:12.743748+0.399644 test-rmse:12.997436+0.727625
## [109] train-rmse:12.690973+0.412301 test-rmse:12.945846+0.739899
## [110] train-rmse:12.651858+0.409110 test-rmse:12.906227+0.743594
## [111] train-rmse:12.566238+0.403905 test-rmse:12.822794+0.736171
## [112] train-rmse:12.521378+0.389322 test-rmse:12.779115+0.730042
## [113] train-rmse:12.490678+0.384063 test-rmse:12.747657+0.726155
## [114] train-rmse:12.442329+0.357870 test-rmse:12.701307+0.694044
## [115] train-rmse:12.395358+0.344879 test-rmse:12.657760+0.672183
## [116] train-rmse:12.370905+0.337244 test-rmse:12.636029+0.661169
## [117] train-rmse:12.312100+0.356418 test-rmse:12.580973+0.655359
## [118] train-rmse:12.241180+0.348380 test-rmse:12.512167+0.640872
## [119] train-rmse:12.202415+0.338332 test-rmse:12.474006+0.635763
## [120] train-rmse:12.166751+0.330310 test-rmse:12.439790+0.617169
## [121] train-rmse:12.114123+0.315740 test-rmse:12.386903+0.589415
## [122] train-rmse:12.075297+0.307016 test-rmse:12.348859+0.586133
## [123] train-rmse:12.008850+0.312077 test-rmse:12.284926+0.588376
## [124] train-rmse:11.927500+0.281543 test-rmse:12.203921+0.570579
## [125] train-rmse:11.866934+0.293521 test-rmse:12.144018+0.571543
## [126] train-rmse:11.807208+0.291227 test-rmse:12.084435+0.585336
## [127] train-rmse:11.760391+0.286373 test-rmse:12.035894+0.575901
## [128] train-rmse:11.717724+0.284048 test-rmse:11.995136+0.565307
## [129] train-rmse:11.679056+0.296256 test-rmse:11.957377+0.584220
## [130] train-rmse:11.634195+0.281800 test-rmse:11.911041+0.574768
## [131] train-rmse:11.594000+0.284716 test-rmse:11.873547+0.568645
## [132] train-rmse:11.555011+0.277994 test-rmse:11.833736+0.567530
## [133] train-rmse:11.515344+0.277920 test-rmse:11.795876+0.576864
## [134] train-rmse:11.470954+0.266291 test-rmse:11.751557+0.569849
## [135] train-rmse:11.430924+0.263766 test-rmse:11.710253+0.567225
## [136] train-rmse:11.397417+0.277372 test-rmse:11.675422+0.574469
## [137] train-rmse:11.380889+0.278505 test-rmse:11.659234+0.577216
## [138] train-rmse:11.351831+0.271880 test-rmse:11.630256+0.560806
## [139] train-rmse:11.307443+0.273585 test-rmse:11.587953+0.561272
## [140] train-rmse:11.272888+0.274491 test-rmse:11.554291+0.559348
## [141] train-rmse:11.244981+0.274483 test-rmse:11.529049+0.545898
## [142] train-rmse:11.209663+0.274223 test-rmse:11.495535+0.552692
## [143] train-rmse:11.161496+0.270217 test-rmse:11.447453+0.552400
## [144] train-rmse:11.122443+0.276070 test-rmse:11.408279+0.562051
## [145] train-rmse:11.093935+0.276443 test-rmse:11.381333+0.565862
## [146] train-rmse:11.046198+0.268312 test-rmse:11.335569+0.565482
## [147] train-rmse:11.013764+0.252605 test-rmse:11.303178+0.562111
## [148] train-rmse:10.980656+0.241410 test-rmse:11.268562+0.545482
## [149] train-rmse:10.947976+0.233810 test-rmse:11.236407+0.546195
## [150] train-rmse:10.898912+0.229192 test-rmse:11.188985+0.540844
## [151] train-rmse:10.869803+0.222678 test-rmse:11.160549+0.537110
## [152] train-rmse:10.847249+0.219519 test-rmse:11.137853+0.532324
## [153] train-rmse:10.814964+0.208642 test-rmse:11.107298+0.512058
## [154] train-rmse:10.791077+0.200110 test-rmse:11.085474+0.508884
## [155] train-rmse:10.764873+0.210425 test-rmse:11.058019+0.500876
## [156] train-rmse:10.738742+0.214985 test-rmse:11.031225+0.494808
## [157] train-rmse:10.706057+0.204146 test-rmse:10.998783+0.477529
## [158] train-rmse:10.677813+0.209446 test-rmse:10.972833+0.474366
## [159] train-rmse:10.656374+0.212735 test-rmse:10.952221+0.474602
## [160] train-rmse:10.613637+0.203139 test-rmse:10.911608+0.459421
## [161] train-rmse:10.598676+0.196652 test-rmse:10.897143+0.456877
## [162] train-rmse:10.565520+0.187652 test-rmse:10.863735+0.452187
## [163] train-rmse:10.532136+0.171342 test-rmse:10.834299+0.448851
## [164] train-rmse:10.515833+0.171246 test-rmse:10.817336+0.448886
## [165] train-rmse:10.489871+0.168680 test-rmse:10.790682+0.438450
## [166] train-rmse:10.463881+0.174288 test-rmse:10.768097+0.430730
## [167] train-rmse:10.431808+0.169665 test-rmse:10.737675+0.427054
## [168] train-rmse:10.413617+0.168289 test-rmse:10.719041+0.434992
## [169] train-rmse:10.400884+0.170443 test-rmse:10.707433+0.435743
## [170] train-rmse:10.381628+0.163730 test-rmse:10.689899+0.430353
## [171] train-rmse:10.357298+0.156236 test-rmse:10.664782+0.428333
## [172] train-rmse:10.340560+0.156245 test-rmse:10.648360+0.429387
## [173] train-rmse:10.324111+0.154571 test-rmse:10.633826+0.430920
## [174] train-rmse:10.304054+0.159555 test-rmse:10.611082+0.429386
## [175] train-rmse:10.292230+0.158318 test-rmse:10.600974+0.426826
## [176] train-rmse:10.271419+0.153204 test-rmse:10.580396+0.423563
## [177] train-rmse:10.250217+0.144801 test-rmse:10.560666+0.425360
## [178] train-rmse:10.236630+0.150307 test-rmse:10.546858+0.428558
## [179] train-rmse:10.219581+0.151138 test-rmse:10.530361+0.429606
## [180] train-rmse:10.203279+0.159297 test-rmse:10.516089+0.436302
## [181] train-rmse:10.170474+0.150174 test-rmse:10.484254+0.432712
## [182] train-rmse:10.150607+0.143100 test-rmse:10.466180+0.432907
## [183] train-rmse:10.128919+0.139769 test-rmse:10.447276+0.430366
## [184] train-rmse:10.116444+0.136392 test-rmse:10.434183+0.428852
## [185] train-rmse:10.103238+0.137503 test-rmse:10.421408+0.425913
## [186] train-rmse:10.079543+0.141208 test-rmse:10.398535+0.423641
## [187] train-rmse:10.061515+0.145919 test-rmse:10.380218+0.420702
## [188] train-rmse:10.043661+0.140927 test-rmse:10.364025+0.420200
## [189] train-rmse:10.030702+0.136669 test-rmse:10.351242+0.418083
## [190] train-rmse:10.013099+0.132077 test-rmse:10.334806+0.417904
## [191] train-rmse:10.003224+0.131046 test-rmse:10.325222+0.417186
## [192] train-rmse:9.977794+0.133473 test-rmse:10.303993+0.409037
## [193] train-rmse:9.955110+0.127801 test-rmse:10.281610+0.414874
## [194] train-rmse:9.937767+0.123408 test-rmse:10.264488+0.410173
## [195] train-rmse:9.920471+0.119351 test-rmse:10.248803+0.402870
## [196] train-rmse:9.904807+0.119572 test-rmse:10.232660+0.404553
## [197] train-rmse:9.894151+0.118215 test-rmse:10.223174+0.406781
## [198] train-rmse:9.877639+0.119703 test-rmse:10.207268+0.411335
## [199] train-rmse:9.860826+0.124701 test-rmse:10.190674+0.413868
## [200] train-rmse:9.851560+0.122998 test-rmse:10.181444+0.415317
## [201] train-rmse:9.836962+0.122353 test-rmse:10.168148+0.414157
## [202] train-rmse:9.821500+0.126547 test-rmse:10.152826+0.413544
## [203] train-rmse:9.808001+0.120111 test-rmse:10.139671+0.416198
## [204] train-rmse:9.789120+0.120033 test-rmse:10.120445+0.409768
## [205] train-rmse:9.770634+0.111066 test-rmse:10.101590+0.413240
## [206] train-rmse:9.756057+0.113862 test-rmse:10.087464+0.410996
## [207] train-rmse:9.740654+0.108818 test-rmse:10.073008+0.413456
## [208] train-rmse:9.729500+0.108962 test-rmse:10.059912+0.405195
## [209] train-rmse:9.713300+0.105780 test-rmse:10.045207+0.405790
## [210] train-rmse:9.701051+0.106585 test-rmse:10.034742+0.410114
## [211] train-rmse:9.682118+0.106183 test-rmse:10.017781+0.404554
## [212] train-rmse:9.673854+0.103980 test-rmse:10.010321+0.405675
## [213] train-rmse:9.654135+0.100130 test-rmse:9.992298+0.406646
## [214] train-rmse:9.640481+0.102486 test-rmse:9.979878+0.404457
## [215] train-rmse:9.632062+0.104691 test-rmse:9.971423+0.405219
## [216] train-rmse:9.621026+0.101507 test-rmse:9.962105+0.406768
## [217] train-rmse:9.608985+0.099291 test-rmse:9.950140+0.404004
## [218] train-rmse:9.594721+0.104760 test-rmse:9.936102+0.401648
## [219] train-rmse:9.582296+0.103846 test-rmse:9.922834+0.399677
## [220] train-rmse:9.567258+0.100433 test-rmse:9.910855+0.402802
## [221] train-rmse:9.558158+0.102900 test-rmse:9.902499+0.404595
## [222] train-rmse:9.542580+0.107015 test-rmse:9.888357+0.400432
## [223] train-rmse:9.523993+0.107321 test-rmse:9.871519+0.388932
## [224] train-rmse:9.507012+0.114603 test-rmse:9.856031+0.382138
## [225] train-rmse:9.499607+0.113914 test-rmse:9.848538+0.383890
## [226] train-rmse:9.487122+0.115743 test-rmse:9.835279+0.381230
## [227] train-rmse:9.477153+0.117298 test-rmse:9.824331+0.382217
## [228] train-rmse:9.468409+0.117378 test-rmse:9.816470+0.382314
## [229] train-rmse:9.456718+0.116577 test-rmse:9.805238+0.383098
## [230] train-rmse:9.443708+0.116666 test-rmse:9.792541+0.381910
## [231] train-rmse:9.434401+0.122522 test-rmse:9.783843+0.384327
## [232] train-rmse:9.424836+0.123293 test-rmse:9.774644+0.381195
## [233] train-rmse:9.417189+0.125157 test-rmse:9.766897+0.382082
## [234] train-rmse:9.404531+0.128939 test-rmse:9.754988+0.380096
## [235] train-rmse:9.385700+0.135677 test-rmse:9.736225+0.384938
## [236] train-rmse:9.380681+0.136036 test-rmse:9.731155+0.385226
## [237] train-rmse:9.369075+0.131875 test-rmse:9.721336+0.385930
## [238] train-rmse:9.356988+0.128752 test-rmse:9.709838+0.389196
## [239] train-rmse:9.347853+0.125199 test-rmse:9.701602+0.388253
## [240] train-rmse:9.343640+0.124666 test-rmse:9.697530+0.388757
## [241] train-rmse:9.332742+0.128993 test-rmse:9.686036+0.388375
## [242] train-rmse:9.322995+0.130996 test-rmse:9.676602+0.390244
## [243] train-rmse:9.315816+0.131785 test-rmse:9.669880+0.388172
## [244] train-rmse:9.305006+0.136571 test-rmse:9.658989+0.391856
## [245] train-rmse:9.298165+0.134730 test-rmse:9.652405+0.392567
## [246] train-rmse:9.290600+0.134522 test-rmse:9.645570+0.395199
## [247] train-rmse:9.281089+0.136824 test-rmse:9.637397+0.396009
## [248] train-rmse:9.266041+0.134092 test-rmse:9.624377+0.392818
## [249] train-rmse:9.259023+0.134405 test-rmse:9.617109+0.390630
## [250] train-rmse:9.242450+0.128554 test-rmse:9.600505+0.390931
## [251] train-rmse:9.234533+0.128259 test-rmse:9.593570+0.393001
## [252] train-rmse:9.220228+0.126664 test-rmse:9.579589+0.389928
## [253] train-rmse:9.212157+0.128353 test-rmse:9.572581+0.386242
## [254] train-rmse:9.205359+0.128029 test-rmse:9.566911+0.387810
## [255] train-rmse:9.199803+0.128730 test-rmse:9.562516+0.386857
## [256] train-rmse:9.192243+0.131254 test-rmse:9.555527+0.386119
## [257] train-rmse:9.180939+0.124446 test-rmse:9.545549+0.387471
## [258] train-rmse:9.173330+0.127704 test-rmse:9.538570+0.386550
## [259] train-rmse:9.165204+0.127079 test-rmse:9.528723+0.379746
## [260] train-rmse:9.157089+0.127403 test-rmse:9.521460+0.377256
## [261] train-rmse:9.149003+0.128913 test-rmse:9.512727+0.372564
## [262] train-rmse:9.136132+0.125149 test-rmse:9.500968+0.377840
## [263] train-rmse:9.127902+0.126763 test-rmse:9.493249+0.377592
## [264] train-rmse:9.123005+0.126542 test-rmse:9.488505+0.377370
## [265] train-rmse:9.117965+0.127221 test-rmse:9.481694+0.370797
## [266] train-rmse:9.109887+0.124306 test-rmse:9.473039+0.369011
## [267] train-rmse:9.104673+0.123309 test-rmse:9.467989+0.368060
## [268] train-rmse:9.092086+0.125318 test-rmse:9.457427+0.360778
## [269] train-rmse:9.077100+0.126328 test-rmse:9.443831+0.362886
## [270] train-rmse:9.071003+0.126513 test-rmse:9.437746+0.363740
## [271] train-rmse:9.064710+0.123078 test-rmse:9.431797+0.362664
## [272] train-rmse:9.057125+0.119671 test-rmse:9.425193+0.365882
## [273] train-rmse:9.048040+0.119653 test-rmse:9.417839+0.364476
## [274] train-rmse:9.038809+0.115257 test-rmse:9.409662+0.365498
## [275] train-rmse:9.033037+0.117989 test-rmse:9.404609+0.363848
## [276] train-rmse:9.028031+0.116568 test-rmse:9.399984+0.364466
## [277] train-rmse:9.020843+0.115073 test-rmse:9.391625+0.359804
## [278] train-rmse:9.013157+0.117254 test-rmse:9.384618+0.358877
## [279] train-rmse:9.007279+0.118086 test-rmse:9.379630+0.361089
## [280] train-rmse:9.002462+0.117524 test-rmse:9.373640+0.358969
## [281] train-rmse:8.993344+0.118662 test-rmse:9.365366+0.355980
## [282] train-rmse:8.983434+0.119871 test-rmse:9.355912+0.355419
## [283] train-rmse:8.977119+0.118818 test-rmse:9.348593+0.354322
## [284] train-rmse:8.970545+0.118439 test-rmse:9.342851+0.353287
## [285] train-rmse:8.965399+0.118460 test-rmse:9.338003+0.352122
## [286] train-rmse:8.956637+0.118442 test-rmse:9.329433+0.348805
## [287] train-rmse:8.947091+0.124131 test-rmse:9.319769+0.344848
## [288] train-rmse:8.942395+0.124864 test-rmse:9.315539+0.345110
## [289] train-rmse:8.936625+0.124930 test-rmse:9.309399+0.342654
## [290] train-rmse:8.930326+0.124412 test-rmse:9.303534+0.343498
## [291] train-rmse:8.924509+0.123912 test-rmse:9.298911+0.342461
## [292] train-rmse:8.918680+0.127292 test-rmse:9.293353+0.339355
## [293] train-rmse:8.909154+0.127753 test-rmse:9.284326+0.341114
## [294] train-rmse:8.897689+0.124022 test-rmse:9.272688+0.342353
## [295] train-rmse:8.891725+0.126154 test-rmse:9.266887+0.344899
## [296] train-rmse:8.887268+0.124457 test-rmse:9.262222+0.344258
## [297] train-rmse:8.877921+0.124734 test-rmse:9.253354+0.340558
## [298] train-rmse:8.868206+0.125806 test-rmse:9.243345+0.334405
## [299] train-rmse:8.863985+0.124192 test-rmse:9.238774+0.335266
## [300] train-rmse:8.855446+0.122762 test-rmse:9.231752+0.337541
## [301] train-rmse:8.848531+0.126742 test-rmse:9.225046+0.339937
## [302] train-rmse:8.842111+0.123371 test-rmse:9.220187+0.339654
## [303] train-rmse:8.833475+0.122159 test-rmse:9.211904+0.342848
## [304] train-rmse:8.824214+0.118498 test-rmse:9.201571+0.345484
## [305] train-rmse:8.819610+0.117201 test-rmse:9.197223+0.347339
## [306] train-rmse:8.814282+0.115841 test-rmse:9.192321+0.348523
## [307] train-rmse:8.806472+0.121544 test-rmse:9.185030+0.347505
## [308] train-rmse:8.801235+0.119215 test-rmse:9.181076+0.348473
## [309] train-rmse:8.797045+0.118628 test-rmse:9.177992+0.349114
## [310] train-rmse:8.792905+0.117712 test-rmse:9.174512+0.349104
## [311] train-rmse:8.788173+0.119693 test-rmse:9.169930+0.349259
## [312] train-rmse:8.780563+0.119018 test-rmse:9.162828+0.348043
## [313] train-rmse:8.769463+0.115788 test-rmse:9.151635+0.360224
## [314] train-rmse:8.758913+0.112986 test-rmse:9.141717+0.367362
## [315] train-rmse:8.749969+0.111973 test-rmse:9.131876+0.372262
## [316] train-rmse:8.742927+0.112606 test-rmse:9.126892+0.370611
## [317] train-rmse:8.737209+0.111397 test-rmse:9.121566+0.372515
## [318] train-rmse:8.729966+0.108540 test-rmse:9.115212+0.372855
## [319] train-rmse:8.727226+0.107770 test-rmse:9.113017+0.373090
## [320] train-rmse:8.723138+0.105875 test-rmse:9.109239+0.372929
## [321] train-rmse:8.716674+0.101418 test-rmse:9.102372+0.371024
## [322] train-rmse:8.713060+0.102855 test-rmse:9.099127+0.369099
## [323] train-rmse:8.709337+0.104452 test-rmse:9.096110+0.368832
## [324] train-rmse:8.700069+0.100996 test-rmse:9.086214+0.365149
## [325] train-rmse:8.689189+0.099130 test-rmse:9.072684+0.366924
## [326] train-rmse:8.684540+0.101045 test-rmse:9.067893+0.368995
## [327] train-rmse:8.681166+0.102288 test-rmse:9.065271+0.369636
## [328] train-rmse:8.675911+0.102319 test-rmse:9.059807+0.371351
## [329] train-rmse:8.670249+0.103454 test-rmse:9.053705+0.372845
## [330] train-rmse:8.656713+0.101649 test-rmse:9.040555+0.371421
## [331] train-rmse:8.651829+0.105010 test-rmse:9.036664+0.370880
## [332] train-rmse:8.642512+0.101655 test-rmse:9.028887+0.370997
## [333] train-rmse:8.637955+0.101776 test-rmse:9.024683+0.369508
## [334] train-rmse:8.631709+0.104609 test-rmse:9.017291+0.363328
## [335] train-rmse:8.624763+0.107185 test-rmse:9.009563+0.362530
## [336] train-rmse:8.621722+0.107305 test-rmse:9.006493+0.362443
## [337] train-rmse:8.614239+0.108263 test-rmse:9.000021+0.362081
## [338] train-rmse:8.612366+0.107792 test-rmse:8.998311+0.362194
## [339] train-rmse:8.603698+0.105506 test-rmse:8.989673+0.366931
## [340] train-rmse:8.594845+0.106247 test-rmse:8.982219+0.364170
## [341] train-rmse:8.582654+0.109504 test-rmse:8.970789+0.364589
## [342] train-rmse:8.577492+0.109999 test-rmse:8.966339+0.363695
## [343] train-rmse:8.568043+0.110500 test-rmse:8.956634+0.363421
## [344] train-rmse:8.564476+0.109676 test-rmse:8.954390+0.363080
## [345] train-rmse:8.556497+0.107998 test-rmse:8.947262+0.360155
## [346] train-rmse:8.549809+0.108851 test-rmse:8.941726+0.358756
## [347] train-rmse:8.545305+0.106709 test-rmse:8.937196+0.358693
## [348] train-rmse:8.539340+0.110751 test-rmse:8.930907+0.356349
## [349] train-rmse:8.535605+0.109778 test-rmse:8.927714+0.358485
## [350] train-rmse:8.527952+0.106946 test-rmse:8.920723+0.357695
## [351] train-rmse:8.522131+0.108023 test-rmse:8.913847+0.353007
## [352] train-rmse:8.514757+0.104509 test-rmse:8.907511+0.350396
## [353] train-rmse:8.508233+0.106299 test-rmse:8.898559+0.345045
## [354] train-rmse:8.504391+0.105023 test-rmse:8.895302+0.345687
## [355] train-rmse:8.502040+0.105116 test-rmse:8.893383+0.346496
## [356] train-rmse:8.493789+0.111683 test-rmse:8.886837+0.341131
## [357] train-rmse:8.487276+0.109294 test-rmse:8.880791+0.340058
## [358] train-rmse:8.484624+0.109270 test-rmse:8.879358+0.340268
## [359] train-rmse:8.481388+0.108789 test-rmse:8.875801+0.340265
## [360] train-rmse:8.472115+0.113875 test-rmse:8.867973+0.334363
## [361] train-rmse:8.467550+0.112128 test-rmse:8.864451+0.334259
## [362] train-rmse:8.465397+0.112407 test-rmse:8.863079+0.334470
## [363] train-rmse:8.459870+0.111300 test-rmse:8.857651+0.339148
## [364] train-rmse:8.454519+0.109234 test-rmse:8.852782+0.337630
## [365] train-rmse:8.449811+0.111535 test-rmse:8.848550+0.335505
## [366] train-rmse:8.441154+0.117538 test-rmse:8.839529+0.336672
## [367] train-rmse:8.437111+0.118444 test-rmse:8.835668+0.335652
## [368] train-rmse:8.430080+0.114298 test-rmse:8.829018+0.335797
## [369] train-rmse:8.425472+0.112264 test-rmse:8.825060+0.337070
## [370] train-rmse:8.419826+0.108449 test-rmse:8.819821+0.336932
## [371] train-rmse:8.414648+0.106507 test-rmse:8.815369+0.336799
## [372] train-rmse:8.410266+0.108397 test-rmse:8.811965+0.336446
## [373] train-rmse:8.404494+0.110066 test-rmse:8.806865+0.335359
## [374] train-rmse:8.398760+0.111229 test-rmse:8.802126+0.332824
## [375] train-rmse:8.394882+0.113068 test-rmse:8.797074+0.328081
## [376] train-rmse:8.388706+0.111188 test-rmse:8.791768+0.328663
## [377] train-rmse:8.380818+0.106269 test-rmse:8.784194+0.331310
## [378] train-rmse:8.376018+0.106524 test-rmse:8.780105+0.329733
## [379] train-rmse:8.372654+0.106806 test-rmse:8.777394+0.330116
## [380] train-rmse:8.365086+0.108462 test-rmse:8.769355+0.329771
## [381] train-rmse:8.360022+0.107402 test-rmse:8.765104+0.332241
## [382] train-rmse:8.356217+0.109956 test-rmse:8.761511+0.330954
## [383] train-rmse:8.351346+0.110337 test-rmse:8.757184+0.330023
## [384] train-rmse:8.348650+0.110514 test-rmse:8.754745+0.328900
## [385] train-rmse:8.345124+0.111960 test-rmse:8.751946+0.328842
## [386] train-rmse:8.341696+0.111095 test-rmse:8.749252+0.327808
## [387] train-rmse:8.333018+0.107681 test-rmse:8.742530+0.326593
## [388] train-rmse:8.326462+0.114535 test-rmse:8.737281+0.321928
## [389] train-rmse:8.321180+0.113817 test-rmse:8.732704+0.321114
## [390] train-rmse:8.314883+0.111216 test-rmse:8.724883+0.317757
## [391] train-rmse:8.311658+0.109820 test-rmse:8.722228+0.317511
## [392] train-rmse:8.307268+0.109485 test-rmse:8.718089+0.317679
## [393] train-rmse:8.299887+0.106847 test-rmse:8.711557+0.321601
## [394] train-rmse:8.296377+0.108840 test-rmse:8.708518+0.319477
## [395] train-rmse:8.284252+0.101986 test-rmse:8.696319+0.318920
## [396] train-rmse:8.277013+0.098498 test-rmse:8.689118+0.317924
## [397] train-rmse:8.271712+0.100596 test-rmse:8.684471+0.317809
## [398] train-rmse:8.266528+0.098587 test-rmse:8.678899+0.316782
## [399] train-rmse:8.262553+0.097379 test-rmse:8.674878+0.315652
## [400] train-rmse:8.255024+0.104937 test-rmse:8.668643+0.311825
## [401] train-rmse:8.250928+0.105900 test-rmse:8.665632+0.310449
## [402] train-rmse:8.246820+0.104919 test-rmse:8.661500+0.311074
## [403] train-rmse:8.240763+0.102843 test-rmse:8.656542+0.316113
## [404] train-rmse:8.236864+0.103735 test-rmse:8.652959+0.314765
## [405] train-rmse:8.234394+0.104069 test-rmse:8.651131+0.314362
## [406] train-rmse:8.231032+0.104143 test-rmse:8.648427+0.313337
## [407] train-rmse:8.223114+0.104787 test-rmse:8.642043+0.317244
## [408] train-rmse:8.215830+0.103464 test-rmse:8.635102+0.315294
## [409] train-rmse:8.211355+0.103566 test-rmse:8.630602+0.318304
## [410] train-rmse:8.208338+0.104152 test-rmse:8.627650+0.319815
## [411] train-rmse:8.200743+0.105829 test-rmse:8.620149+0.316057
## [412] train-rmse:8.194809+0.105608 test-rmse:8.613854+0.315495
## [413] train-rmse:8.189666+0.107870 test-rmse:8.607138+0.311749
## [414] train-rmse:8.185607+0.109902 test-rmse:8.603459+0.310936
## [415] train-rmse:8.182045+0.110686 test-rmse:8.600648+0.310870
## [416] train-rmse:8.179227+0.109856 test-rmse:8.598332+0.309993
## [417] train-rmse:8.174266+0.109494 test-rmse:8.592953+0.308735
## [418] train-rmse:8.167821+0.108815 test-rmse:8.587512+0.309684
## [419] train-rmse:8.161695+0.107251 test-rmse:8.581133+0.308699
## [420] train-rmse:8.157865+0.106188 test-rmse:8.577647+0.307995
## [421] train-rmse:8.154206+0.105058 test-rmse:8.574034+0.307711
## [422] train-rmse:8.152153+0.104021 test-rmse:8.572143+0.309135
## [423] train-rmse:8.148262+0.103377 test-rmse:8.569415+0.308855
## [424] train-rmse:8.143459+0.100853 test-rmse:8.565144+0.311188
## [425] train-rmse:8.139362+0.100260 test-rmse:8.561183+0.312183
## [426] train-rmse:8.137227+0.099973 test-rmse:8.559392+0.312494
## [427] train-rmse:8.134416+0.098866 test-rmse:8.557185+0.313116
## [428] train-rmse:8.130422+0.101029 test-rmse:8.551822+0.308770
## [429] train-rmse:8.125710+0.100204 test-rmse:8.548025+0.307216
## [430] train-rmse:8.122021+0.100741 test-rmse:8.543457+0.304219
## [431] train-rmse:8.118519+0.102938 test-rmse:8.538539+0.301268
## [432] train-rmse:8.115112+0.105820 test-rmse:8.534198+0.298014
## [433] train-rmse:8.109576+0.106966 test-rmse:8.529557+0.297208
## [434] train-rmse:8.104453+0.105182 test-rmse:8.525045+0.300921
## [435] train-rmse:8.102312+0.104614 test-rmse:8.523535+0.300780
## [436] train-rmse:8.099404+0.105157 test-rmse:8.521761+0.298555
## [437] train-rmse:8.097651+0.105387 test-rmse:8.519973+0.298603
## [438] train-rmse:8.092798+0.107474 test-rmse:8.515911+0.298282
## [439] train-rmse:8.087656+0.105214 test-rmse:8.511749+0.297680
## [440] train-rmse:8.083394+0.103154 test-rmse:8.508649+0.298580
## [441] train-rmse:8.080258+0.103120 test-rmse:8.505164+0.297517
## [442] train-rmse:8.076874+0.105540 test-rmse:8.502965+0.295661
## [443] train-rmse:8.073629+0.106265 test-rmse:8.500793+0.295514
## [444] train-rmse:8.069715+0.106382 test-rmse:8.497350+0.297009
## [445] train-rmse:8.066160+0.106491 test-rmse:8.494838+0.295578
## [446] train-rmse:8.063038+0.106727 test-rmse:8.492668+0.295896
## [447] train-rmse:8.058164+0.107094 test-rmse:8.487546+0.296164
## [448] train-rmse:8.054766+0.108186 test-rmse:8.485156+0.295207
## [449] train-rmse:8.052449+0.108331 test-rmse:8.483192+0.294671
## [450] train-rmse:8.048032+0.108651 test-rmse:8.479274+0.295044
## [451] train-rmse:8.045126+0.109984 test-rmse:8.477350+0.294835
## [452] train-rmse:8.038511+0.112459 test-rmse:8.470843+0.295767
## [453] train-rmse:8.033699+0.110321 test-rmse:8.465756+0.294978
## [454] train-rmse:8.027253+0.108786 test-rmse:8.459952+0.294763
## [455] train-rmse:8.023795+0.107541 test-rmse:8.455992+0.294531
## [456] train-rmse:8.020175+0.109345 test-rmse:8.453409+0.293265
## [457] train-rmse:8.014451+0.112010 test-rmse:8.447573+0.292246
## [458] train-rmse:8.012392+0.111272 test-rmse:8.446089+0.291898
## [459] train-rmse:8.008706+0.111259 test-rmse:8.443013+0.291223
## [460] train-rmse:8.005024+0.112409 test-rmse:8.440601+0.291097
## [461] train-rmse:8.001924+0.112444 test-rmse:8.437531+0.291513
## [462] train-rmse:7.998080+0.113186 test-rmse:8.432672+0.289153
## [463] train-rmse:7.994006+0.113843 test-rmse:8.429280+0.289369
## [464] train-rmse:7.987688+0.113642 test-rmse:8.424036+0.291415
## [465] train-rmse:7.984455+0.112733 test-rmse:8.420100+0.289553
## [466] train-rmse:7.979910+0.112689 test-rmse:8.416054+0.290919
## [467] train-rmse:7.976199+0.110366 test-rmse:8.412339+0.289246
## [468] train-rmse:7.973880+0.110837 test-rmse:8.410325+0.289415
## [469] train-rmse:7.970769+0.111544 test-rmse:8.407548+0.289310
## [470] train-rmse:7.967203+0.111272 test-rmse:8.404503+0.288580
## [471] train-rmse:7.963899+0.110897 test-rmse:8.402371+0.288409
## [472] train-rmse:7.960302+0.110280 test-rmse:8.399463+0.290735
## [473] train-rmse:7.953855+0.111236 test-rmse:8.393564+0.289393
## [474] train-rmse:7.949619+0.113152 test-rmse:8.390611+0.288573
## [475] train-rmse:7.944849+0.111194 test-rmse:8.385507+0.286848
## [476] train-rmse:7.941335+0.110653 test-rmse:8.383277+0.288130
## [477] train-rmse:7.938676+0.111717 test-rmse:8.381802+0.286679
## [478] train-rmse:7.934691+0.114084 test-rmse:8.379222+0.285194
## [479] train-rmse:7.932671+0.113550 test-rmse:8.377414+0.285567
## [480] train-rmse:7.930535+0.114241 test-rmse:8.376175+0.285055
## [481] train-rmse:7.926017+0.114892 test-rmse:8.370511+0.281872
## [482] train-rmse:7.920363+0.116034 test-rmse:8.365403+0.281006
## [483] train-rmse:7.916318+0.116981 test-rmse:8.361843+0.282089
## [484] train-rmse:7.911390+0.115963 test-rmse:8.358424+0.284879
## [485] train-rmse:7.909783+0.115574 test-rmse:8.357141+0.285639
## [486] train-rmse:7.905772+0.113984 test-rmse:8.352969+0.287693
## [487] train-rmse:7.903043+0.114656 test-rmse:8.350924+0.287935
## [488] train-rmse:7.898748+0.115707 test-rmse:8.347681+0.286642
## [489] train-rmse:7.895827+0.115516 test-rmse:8.344660+0.286553
## [490] train-rmse:7.894121+0.116428 test-rmse:8.343343+0.286141
## [491] train-rmse:7.890137+0.118835 test-rmse:8.340771+0.285379
## [492] train-rmse:7.884099+0.119281 test-rmse:8.335552+0.285053
## [493] train-rmse:7.879485+0.119768 test-rmse:8.331183+0.286299
## [494] train-rmse:7.876964+0.119746 test-rmse:8.328944+0.287170
## [495] train-rmse:7.872779+0.117897 test-rmse:8.324450+0.291334
## [496] train-rmse:7.869536+0.117146 test-rmse:8.321783+0.291531
## [497] train-rmse:7.866731+0.118037 test-rmse:8.319353+0.290080
## [498] train-rmse:7.865417+0.117851 test-rmse:8.318216+0.289873
## [499] train-rmse:7.856918+0.115144 test-rmse:8.309280+0.285302
## [500] train-rmse:7.853476+0.115487 test-rmse:8.306489+0.284709
best_nrounds <- cv_results$best_iteration
# Train the final model using the best number of rounds found
model_xgb <- xgb.train(
params = params,
data = dtrain,
nrounds = best_nrounds
)
# Make predictions and evaluate the model
train_pred <- predict(model_xgb, dtrain)
test_pred <- predict(model_xgb, dtest)
train_rmse <- sqrt(mean((train_labels - train_pred)^2))
test_rmse <- sqrt(mean((test_labels - test_pred)^2))
# Calculate R-squared for the training set
sst_train <- sum((train_labels - mean(train_labels)) ^ 2)
ssr_train <- sum((train_labels - train_pred) ^ 2)
r_squared_train <- 1 - (ssr_train / sst_train)
# Calculate R-squared for the test set
sst_test <- sum((test_labels - mean(test_labels)) ^ 2)
ssr_test <- sum((test_labels - test_pred) ^ 2)
r_squared_test <- 1 - (ssr_test / sst_test)
train_mape <- mean(abs((train_labels - train_pred) / train_labels)) * 100
test_mape <- mean(abs((test_labels - test_pred) / test_labels)) * 100
train_mae <- mean(abs(train_labels - train_pred))
test_mae <- mean(abs(test_labels - test_pred))
cat("Model Performance Metrics:\n",
"--------------------------\n",
"Training RMSE: ", train_rmse, "\n",
"Test RMSE: ", test_rmse, "\n",
"Training R-squared: ", r_squared_train, "\n",
"Test R-squared: ", r_squared_test, "\n",
"Training MAE: ", train_mae, "\n",
"Test MAE: ", test_mae, "\n",
"Training MAPE: ", train_mape, "%\n",
"Test MAPE: ", test_mape, "%\n", sep="")
## Model Performance Metrics:
## --------------------------
## Training RMSE: 7.933109
## Test RMSE: 8.062598
## Training R-squared: 0.9824703
## Test R-squared: 0.9816268
## Training MAE: 4.424568
## Test MAE: 4.50928
## Training MAPE: 28.69924%
## Test MAPE: 29.15366%
# Correcting Residuals Data Frame
# Assuming 'train_labels' and 'test_labels' contain the actual values,
# and 'train_pred' and 'test_pred' contain your model's predictions:
residuals_train <- train_labels - train_pred
residuals_test <- test_labels - test_pred
residuals_data <- data.frame(
Residuals = c(residuals_train, residuals_test),
Dataset = c(rep('Training', length(residuals_train)), rep('Test', length(residuals_test)))
)
# Now plotting residuals with corrected data
ggplot(residuals_data, aes(x = Residuals, fill = Dataset)) +
geom_histogram(binwidth = 1, position = 'identity', alpha = 0.6) +
facet_wrap(~ Dataset) +
ggtitle('Residuals Distribution')
# Assuming train_labels, test_labels, train_pred, and test_pred are correctly defined
# Adjusted Actual vs. Predicted Data Preparation
actual_pred_data <- data.frame(
Actual = c(train_labels, test_labels),
Predicted = c(train_pred, test_pred),
Dataset = c(rep('Training', length(train_labels)), rep('Test', length(test_labels)))
)
# Plotting Actual vs. Predicted Values
ggplot(actual_pred_data, aes(x = Actual, y = Predicted, colour = Dataset)) +
geom_point(alpha = 0.6) +
geom_abline(intercept = 0, slope = 1, linetype = 'dashed', color = 'red') +
xlab('Actual Values') +
ylab('Predicted Values') +
scale_colour_manual(values = c('Training' = 'blue', 'Test' = 'red')) +
ggtitle('Actual vs. Predicted Values')
library(xgboost)
# Calculate feature importance
importance_matrix <- xgb.importance(feature_names = colnames(train_features), model = model_xgb)
# View the feature importance scores
print(importance_matrix)
## Feature Gain Cover Frequency
## <char> <num> <num> <num>
## 1: DOLLAR_SALES 7.250497e-01 3.190890e-01 0.2587792642
## 2: 12SMALL 12ONE CUP 1.111782e-01 7.959344e-02 0.0592251951
## 3: 12SMALL 24ONE CUP 4.425131e-02 5.997315e-02 0.0386008919
## 4: WEEKS_SINCE_LAUNCH 2.499099e-02 1.157388e-01 0.1399108138
## 5: 20SMALL MULTI JUG 1.713063e-02 3.603165e-02 0.0381828317
## 6: POP_SQMI 8.908652e-03 5.965322e-02 0.1082775920
## 7: 12SMALL MLT PLASTICS JUG 8.846117e-03 1.978074e-02 0.0202062430
## 8: REGION_ARIZONA 7.742566e-03 9.364009e-03 0.0257803790
## 9: ITEM_BEAUTIFUL GREENER PLUM 6.940521e-03 2.274741e-02 0.0274526198
## 10: 12SMALL 18ONE CUP 6.188074e-03 3.062063e-02 0.0218784838
## 11: ITEM_RAINING THRASHED PLUM 5.649968e-03 1.197710e-02 0.0209030100
## 12: REGION_SOCAL 5.218939e-03 8.922647e-03 0.0197881828
## 13: .5L 6ONE JUG 3.726497e-03 1.397857e-02 0.0119843924
## 14: BRAND_BEAUTIFUL GREENER 3.409390e-03 8.407059e-03 0.0096153846
## 15: CALORIC_SEGMENT 3.301499e-03 7.090121e-03 0.0089186176
## 16: REGION_MOUNTAIN 2.358481e-03 1.632401e-02 0.0091973244
## 17: ITEM_ZIZZLES PLUM 2.354024e-03 8.482146e-03 0.0083612040
## 18: 12SMALL 20ONE CUP 2.275258e-03 2.722203e-02 0.0137959866
## 19: WEEK_OF_YEAR 2.089544e-03 4.534149e-02 0.0728818283
## 20: BRAND_SINGLE GROUP 1.860115e-03 3.376159e-03 0.0036231884
## 21: ITEM_ SMASH SUNSET 1.568908e-03 3.105690e-03 0.0032051282
## 22: REGION_KANSAS 9.756360e-04 2.029609e-02 0.0126811594
## 23: REGION_COLORADO 8.989604e-04 7.888115e-03 0.0100334448
## 24: REGION_CALI_NEVADA 7.861451e-04 9.748476e-03 0.0048773690
## 25: 12SMALL 24ONE PLASTICS JUG 6.618125e-04 2.513283e-02 0.0089186176
## 26: BRAND_HILL MOISTURE THRASHED APPLE 5.271685e-04 3.925149e-03 0.0066889632
## 27: REGION_NORTHERN 4.438187e-04 4.679586e-03 0.0085005574
## 28: REGION_DESERT_SW 4.156559e-04 1.583096e-02 0.0147714604
## 29: BRAND_DIET SMASH 1.136502e-04 1.033363e-03 0.0018115942
## 30: REGION_NOCAL 6.921370e-05 1.233931e-03 0.0062709030
## 31: REGION_NEWMEXICO 4.339678e-05 4.059228e-05 0.0018115942
## 32: REGION_PRAIRIE 2.111430e-05 3.943581e-04 0.0018115942
## 33: 12SMALL 6ONE CUP 3.414952e-06 2.924409e-03 0.0009754738
## 34: ITEM_GO-DAY 3.013295e-07 2.260741e-05 0.0001393534
## 35: BRAND_GO-DAY 2.776568e-07 3.049391e-05 0.0001393534
## Feature Gain Cover Frequency
# Plot the feature importance
xgb.plot.importance(importance_matrix = importance_matrix)
# Compute partial dependence data for 'DOLLAR_SALES' and 'plum', CALORIC_SEGMENT, and "ENERGY
# pd <- partial(model_xgb, pred.var = c("DOLLAR_SALES", "plum", "CALORIC_SEGMENT", ENERGY"), train = train_features, grid.resolution = 20)
#
# # Default PDP
# pdp1 <- plotPartial(pd, plot = TRUE)
#
# # Add contour lines and use a different color palette
# rwb <- colorRampPalette(c("red", "white", "blue"))
# pdp2 <- plotPartial(pd, contour = TRUE, col.regions = rwb)
#
# # 3-D surface
# pdp3 <- plotPartial(pd, levelplot = FALSE, zlab = "Predicted Outcome", drape = TRUE, colorkey = TRUE, screen = list(z = -20, x = -60))
#
# # Combine plots into one window
# grid.arrange(pdp1, pdp2, pdp3, ncol = 3)
Model with NO DOLLAR SALES Variable
# Assuming 'df' is your complete dataframe and 'UNIT_SALES' is your target variable
df2 <- df
# Remove DOLLAR_SALES from the features
df2$DOLLAR_SALES <- NULL
# Split the updated data into training and testing sets (assuming you're using a similar approach as before)
set.seed(123)
df2_testtrn <- initial_split(df2, prop = 0.8, strata = UNIT_SALES)
Train <- training(df2_testtrn)
Test <- testing(df2_testtrn)
# Prepare features and labels for XGBoost, excluding DOLLAR_SALES
train_features <- Train[, -which(names(Train) == "UNIT_SALES")]
train_labels <- Train$UNIT_SALES
test_features <- Test[, -which(names(Test) == "UNIT_SALES")]
test_labels <- Test$UNIT_SALES
# Convert data to DMatrix format for XGBoost
dtrain <- xgb.DMatrix(data = as.matrix(train_features), label = train_labels)
dtest <- xgb.DMatrix(data = as.matrix(test_features), label = test_labels)
# Assuming 'params' and 'best_nrounds' are defined as before
# Train the final model without DOLLAR_SALES
model_xgb_no_dollar_sales <- xgb.train(
params = params,
data = dtrain,
nrounds = best_nrounds
)
# Make predictions and evaluate the model
train_pred <- predict(model_xgb_no_dollar_sales, dtrain)
test_pred <- predict(model_xgb_no_dollar_sales, dtest)
train_rmse <- sqrt(mean((train_labels - train_pred)^2))
test_rmse <- sqrt(mean((test_labels - test_pred)^2))
# Calculate R-squared for the training set
sst_train <- sum((train_labels - mean(train_labels)) ^ 2)
ssr_train <- sum((train_labels - train_pred) ^ 2)
r_squared_train <- 1 - (ssr_train / sst_train)
# Calculate R-squared for the test set
sst_test <- sum((test_labels - mean(test_labels)) ^ 2)
ssr_test <- sum((test_labels - test_pred) ^ 2)
r_squared_test <- 1 - (ssr_test / sst_test)
train_mape <- mean(abs((train_labels - train_pred) / train_labels)) * 100
test_mape <- mean(abs((test_labels - test_pred) / test_labels)) * 100
train_mae <- mean(abs(train_labels - train_pred))
test_mae <- mean(abs(test_labels - test_pred))
# Correcting Residuals Data Frame
# Assuming 'train_labels' and 'test_labels' contain the actual values,
# and 'train_pred' and 'test_pred' contain your model's predictions:
residuals_train <- train_labels - train_pred
residuals_test <- test_labels - test_pred
residuals_data <- data.frame(
Residuals = c(residuals_train, residuals_test),
Dataset = c(rep('Training', length(residuals_train)), rep('Test', length(residuals_test)))
)
# Now plotting residuals with corrected data
ggplot(residuals_data, aes(x = Residuals, fill = Dataset)) +
geom_histogram(binwidth = 1, position = 'identity', alpha = 0.6) +
facet_wrap(~ Dataset) +
ggtitle('Residuals Distribution')
# Assuming train_labels, test_labels, train_pred, and test_pred are correctly defined
# Adjusted Actual vs. Predicted Data Preparation
actual_pred_data <- data.frame(
Actual = c(train_labels, test_labels),
Predicted = c(train_pred, test_pred),
Dataset = c(rep('Training', length(train_labels)), rep('Test', length(test_labels)))
)
# Plotting Actual vs. Predicted Values
ggplot(actual_pred_data, aes(x = Actual, y = Predicted, colour = Dataset)) +
geom_point(alpha = 0.6) +
geom_abline(intercept = 0, slope = 1, linetype = 'dashed', color = 'red') +
xlab('Actual Values') +
ylab('Predicted Values') +
scale_colour_manual(values = c('Training' = 'blue', 'Test' = 'red')) +
ggtitle('Actual vs. Predicted Values')
cat("Model Performance Metrics:\n",
"--------------------------\n",
"Training RMSE: ", train_rmse, "\n",
"Test RMSE: ", test_rmse, "\n",
"Training R-squared: ", r_squared_train, "\n",
"Test R-squared: ", r_squared_test, "\n",
"Training MAE: ", train_mae, "\n",
"Test MAE: ", test_mae, "\n",
"Training MAPE: ", train_mape, "%\n",
"Test MAPE: ", test_mape, "%\n", sep="")
## Model Performance Metrics:
## --------------------------
## Training RMSE: 41.82988
## Test RMSE: 41.64189
## Training R-squared: 0.5126277
## Test R-squared: 0.5098883
## Training MAE: 26.72833
## Test MAE: 26.70301
## Training MAPE: 245.5749%
## Test MAPE: 242.3915%
# Calculate feature importance
importance_matrix2 <- xgb.importance(feature_names = colnames(train_features), model = model_xgb_no_dollar_sales)
# View the feature importance scores
print(importance_matrix2)
## Feature Gain Cover Frequency
## <char> <num> <num> <num>
## 1: POP_SQMI 3.216350e-01 3.099819e-01 0.3096774194
## 2: ITEM_BEAUTIFUL GREENER PLUM 1.074171e-01 3.090875e-02 0.0438709677
## 3: WEEKS_SINCE_LAUNCH 6.342494e-02 6.155881e-02 0.1195698925
## 4: REGION_SOCAL 5.763006e-02 3.082725e-02 0.0417204301
## 5: ITEM_RAINING THRASHED PLUM 4.364411e-02 9.919278e-03 0.0225089606
## 6: 12SMALL 12ONE CUP 4.290751e-02 1.348118e-02 0.0210752688
## 7: BRAND_BEAUTIFUL GREENER 4.080563e-02 1.230483e-02 0.0203584229
## 8: 20SMALL MULTI JUG 3.799602e-02 2.140640e-02 0.0278136201
## 9: REGION_ARIZONA 3.096537e-02 1.168918e-02 0.0365591398
## 10: REGION_MOUNTAIN 3.034199e-02 7.063550e-03 0.0127598566
## 11: 12SMALL MLT PLASTICS JUG 2.540258e-02 6.411313e-02 0.0255197133
## 12: .5L 6ONE JUG 2.513305e-02 6.938326e-02 0.0210752688
## 13: REGION_NORTHERN 1.905685e-02 4.979571e-03 0.0090322581
## 14: 12SMALL 18ONE CUP 1.496511e-02 5.381762e-02 0.0170609319
## 15: ITEM_ZIZZLES PLUM 1.410018e-02 1.020220e-02 0.0067383513
## 16: REGION_KANSAS 1.285587e-02 2.211024e-02 0.0117562724
## 17: WEEK_OF_YEAR 1.082639e-02 4.355395e-02 0.0771326165
## 18: 12SMALL 24ONE CUP 1.007761e-02 4.561881e-02 0.0183512545
## 19: REGION_CALI_NEVADA 9.864727e-03 1.202192e-02 0.0127598566
## 20: BRAND_HILL MOISTURE THRASHED APPLE 8.640680e-03 3.853981e-03 0.0087455197
## 21: REGION_NOCAL 8.421951e-03 1.689332e-02 0.0364157706
## 22: REGION_COLORADO 8.022730e-03 3.949575e-03 0.0149103943
## 23: CALORIC_SEGMENT 7.846218e-03 2.547401e-03 0.0068817204
## 24: ITEM_ SMASH SUNSET 7.362977e-03 2.615727e-03 0.0043010753
## 25: 12SMALL 20ONE CUP 7.111921e-03 3.886452e-02 0.0124731183
## 26: BRAND_SINGLE GROUP 5.698619e-03 4.076979e-03 0.0028673835
## 27: 12SMALL 24ONE PLASTICS JUG 5.310337e-03 3.481718e-02 0.0106093190
## 28: REGION_NEWMEXICO 4.566905e-03 5.244769e-03 0.0063082437
## 29: 12SMALL 6ONE CUP 4.553726e-03 2.236744e-02 0.0067383513
## 30: REGION_PRAIRIE 4.033372e-03 7.846924e-03 0.0087455197
## 31: REGION_DESERT_SW 3.336404e-03 1.188687e-03 0.0170609319
## 32: BRAND_DIET SMASH 3.194828e-03 7.460710e-04 0.0014336918
## 33: ITEM_GO-DAY 1.762084e-03 8.690429e-03 0.0031541219
## 34: BRAND_GO-DAY 5.654087e-04 4.401173e-03 0.0012903226
## 35: BRAND_FANTASMIC 4.786928e-04 6.941731e-03 0.0021505376
## 36: ITEM_FANTASMIC CUSTARD APPLE PLUM 4.307299e-05 1.227971e-05 0.0005734767
## Feature Gain Cover Frequency
xgb.plot.importance(importance_matrix = importance_matrix2)
if (!requireNamespace("pdp", quietly = TRUE)) install.packages("pdp")
if (!requireNamespace("xgboost", quietly = TRUE)) install.packages("xgboost")
library(pdp)
##
## Attaching package: 'pdp'
## The following object is masked from 'package:purrr':
##
## partial
library(xgboost)
pdp::partial(model_xgb_no_dollar_sales, pred.var = "WEEK_OF_YEAR", train = train_features)
## WEEK_OF_YEAR yhat
## 1 1.00 42.33142
## 2 2.04 42.13614
## 3 3.08 42.09447
## 4 4.12 41.77467
## 5 5.16 41.66057
## 6 6.20 42.74848
## 7 7.24 42.75283
## 8 8.28 42.74383
## 9 9.32 42.74452
## 10 10.36 42.90974
## 11 11.40 42.98448
## 12 12.44 42.97560
## 13 13.48 45.00818
## 14 14.52 45.28822
## 15 15.56 45.22283
## 16 16.60 45.18938
## 17 17.64 45.19483
## 18 18.68 45.19402
## 19 19.72 46.63148
## 20 20.76 46.62565
## 21 21.80 46.84449
## 22 22.84 46.85041
## 23 23.88 46.84909
## 24 24.92 46.87301
## 25 25.96 46.88228
## 26 27.00 46.88350
## 27 28.04 46.87881
## 28 29.08 46.88468
## 29 30.12 46.88782
## 30 31.16 46.89002
## 31 32.20 46.93946
## 32 33.24 46.93441
## 33 34.28 46.91926
## 34 35.32 46.92555
## 35 36.36 49.52571
## 36 37.40 49.54256
## 37 38.44 49.17260
## 38 39.48 49.16802
## 39 40.52 49.18255
## 40 41.56 48.74465
## 41 42.60 48.30548
## 42 43.64 48.19900
## 43 44.68 48.13644
## 44 45.72 48.25533
## 45 46.76 48.24118
## 46 47.80 50.63079
## 47 48.84 43.79528
## 48 49.88 43.79441
## 49 50.92 44.78383
## 50 51.96 50.41952
## 51 53.00 47.26535
pd <- partial(model_xgb_no_dollar_sales, pred.var = "WEEK_OF_YEAR", train = train_features, grid.resolution = 20)
# Default PDP
pdp1 <- plotPartial(pd, plot = TRUE)
# plot
grid.arrange(pdp1)
Based on the plum Energy Drink innovation datafram we expect the best 13 weeks to be between about weeks 33 and weeks 46.
# Set up
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, skimr, knitr, caret, readr,
ggplot2, dplyr, tidymodels, pROC, xgboost, doParallel, vip, DALEXtra, pdp, DALEX, gridExtra)
plum <- read_csv("plum_tableau.csv")
## Rows: 108886 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): CATEGORY, MANUFACTURER, BRAND, PACKAGE, ITEM, REGION, SEASON, PACK...
## dbl (8): MARKET_KEY, CALORIC_SEGMENT, UNIT_SALES, DOLLAR_SALES, POP_SQMI, M...
## date (2): DATE, min_launch_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(plum)
## spc_tbl_ [108,886 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ MARKET_KEY : num [1:108886] 1 1 1 1 1 1 1 1 1 1 ...
## $ DATE : Date[1:108886], format: "2022-06-18" "2022-04-30" ...
## $ CALORIC_SEGMENT : num [1:108886] 1 1 1 1 1 1 1 1 1 1 ...
## $ CATEGORY : chr [1:108886] "SSD" "SSD" "SSD" "SSD" ...
## $ UNIT_SALES : num [1:108886] 1 14 18 13 19 4 29 35 75 25 ...
## $ DOLLAR_SALES : num [1:108886] 4.62 86.86 89.73 65.6 72.93 ...
## $ MANUFACTURER : chr [1:108886] "JOLLYS" "JOLLYS" "JOLLYS" "JOLLYS" ...
## $ BRAND : chr [1:108886] "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" ...
## $ PACKAGE : chr [1:108886] "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" ...
## $ ITEM : chr [1:108886] "RAINING THRASHED PLUM" "RAINING THRASHED PLUM" "RAINING THRASHED PLUM" "RAINING THRASHED PLUM" ...
## $ POP_SQMI : num [1:108886] 1.2 1.2 1.2 1.2 1.2 ...
## $ REGION : chr [1:108886] "NORTHERN" "NORTHERN" "NORTHERN" "NORTHERN" ...
## $ MONTH : num [1:108886] 6 4 12 7 11 6 1 11 10 12 ...
## $ SEASON : chr [1:108886] "SUMMER" "SPRING" "WINTER" "SUMMER" ...
## $ PACKAGE2 : chr [1:108886] "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" ...
## $ ENERGY_DRINK : num [1:108886] 0 0 0 0 0 0 0 0 0 0 ...
## $ CALORIC_SEGMENT_TEXT: chr [1:108886] NA NA NA NA ...
## $ min_launch_date : Date[1:108886], format: "2021-09-04" "2021-09-04" ...
## $ WEEKS_SINCE_LAUNCH : num [1:108886] 41 34 14 47 12 40 20 9 6 13 ...
## - attr(*, "spec")=
## .. cols(
## .. MARKET_KEY = col_double(),
## .. DATE = col_date(format = ""),
## .. CALORIC_SEGMENT = col_double(),
## .. CATEGORY = col_character(),
## .. UNIT_SALES = col_double(),
## .. DOLLAR_SALES = col_double(),
## .. MANUFACTURER = col_character(),
## .. BRAND = col_character(),
## .. PACKAGE = col_character(),
## .. ITEM = col_character(),
## .. POP_SQMI = col_double(),
## .. REGION = col_character(),
## .. MONTH = col_double(),
## .. SEASON = col_character(),
## .. PACKAGE2 = col_character(),
## .. ENERGY_DRINK = col_double(),
## .. CALORIC_SEGMENT_TEXT = col_character(),
## .. min_launch_date = col_date(format = ""),
## .. WEEKS_SINCE_LAUNCH = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
#remove any rows that have PACKAGE containing JUG
plum <- plum %>% filter(!str_detect(PACKAGE, "JUG"))
#remove any rows that have PACKAGE containing JUG
plum <- plum %>% filter(!str_detect(PACKAGE2, "JUG"))
#remove any rows containing SMASH SUNSET
plum <- plum %>% filter(!str_detect(ITEM, "SMASH"))
# #remove any rows containing FANTASMIC
# plum <- plum %>% filter(!str_detect(ITEM, "FANTASMIC"))
#
# #remove any rows containing GO DAY
# df <- df %>% filter(!str_detect(ITEM, "GO DAY"))
print(unique(df$ITEM))
## NULL
print(unique(df$PACKAGE))
## NULL
# Assuming you have already loaded your data into plum dataframe
# Remove the MARKET_KEY column
plum <- plum[, !names(plum) %in% "MARKET_KEY"]
# Identify categorical variables
categorical_cols <- c( "MANUFACTURER", "BRAND", "PACKAGE", "REGION", "SEASON")
# One-hot encode categorical variables
encoded_data <- dummyVars("~.", data = plum[categorical_cols], fullRank = TRUE)
encoded_data <- predict(encoded_data, newdata = plum[categorical_cols])
# Bind the encoded variables with the original dataframe
plum_encoded <- cbind(plum, encoded_data)
# Remove the original categorical variables
plum_encoded <- plum_encoded[, !(names(plum_encoded) %in% categorical_cols)]
# Now you can proceed with the modeling process
str(plum_encoded)
## 'data.frame': 43366 obs. of 33 variables:
## $ DATE : Date, format: "2022-06-18" "2022-04-30" ...
## $ CALORIC_SEGMENT : num 1 1 1 1 1 1 1 1 1 1 ...
## $ CATEGORY : chr "SSD" "SSD" "SSD" "SSD" ...
## $ UNIT_SALES : num 1 14 18 13 19 4 29 35 75 25 ...
## $ DOLLAR_SALES : num 4.62 86.86 89.73 65.6 72.93 ...
## $ ITEM : chr "RAINING THRASHED PLUM" "RAINING THRASHED PLUM" "RAINING THRASHED PLUM" "RAINING THRASHED PLUM" ...
## $ POP_SQMI : num 1.2 1.2 1.2 1.2 1.2 ...
## $ MONTH : num 6 4 12 7 11 6 1 11 10 12 ...
## $ PACKAGE2 : chr "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" ...
## $ ENERGY_DRINK : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CALORIC_SEGMENT_TEXT : chr NA NA NA NA ...
## $ min_launch_date : Date, format: "2021-09-04" "2021-09-04" ...
## $ WEEKS_SINCE_LAUNCH : num 41 34 14 47 12 40 20 9 6 13 ...
## $ MANUFACTURERJOLLYS : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BRANDHILL MOISTURE THRASHED APPLE: num 1 1 1 1 1 1 1 1 1 1 ...
## $ BRANDSINGLE GROUP : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PACKAGE12SMALL 18ONE CUP : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PACKAGE12SMALL 20ONE CUP : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PACKAGE12SMALL 24ONE CUP : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PACKAGE12SMALL 6ONE CUP : num 0 0 0 0 0 0 0 0 0 0 ...
## $ REGIONCALI_NEVADA : num 0 0 0 0 0 0 0 0 0 0 ...
## $ REGIONCOLORADO : num 0 0 0 0 0 0 0 0 0 0 ...
## $ REGIONDESERT_SW : num 0 0 0 0 0 0 0 0 0 0 ...
## $ REGIONKANSAS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ REGIONMOUNTAIN : num 0 0 0 0 0 0 0 0 0 0 ...
## $ REGIONNEWMEXICO : num 0 0 0 0 0 0 0 0 0 0 ...
## $ REGIONNOCAL : num 0 0 0 0 0 0 0 0 0 0 ...
## $ REGIONNORTHERN : num 1 1 1 1 1 1 1 1 1 1 ...
## $ REGIONPRAIRIE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ REGIONSOCAL : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SEASONSPRING : num 0 1 0 0 0 0 0 0 0 0 ...
## $ SEASONSUMMER : num 1 0 0 1 0 1 0 0 0 0 ...
## $ SEASONWINTER : num 0 0 1 0 0 0 1 0 0 1 ...
# List to store unique values for each variable
unique_values_list <- list()
# Columns to get unique values for
columns_to_get_unique_values <- c("BRAND", "PACKAGE", "ITEM", "REGION", "SEASON")
# Get unique values for each variable and store in the list
for (col in columns_to_get_unique_values) {
unique_values_list[[col]] <- unique(plum[[col]])
}
# Loop over unique regions and create new columns
for (region in unique_values_list$REGION) {
plum[[region]] <- as.integer(grepl(region, plum$REGION))
}
# Loop over unique brands and create new columns
for (brand in unique_values_list$BRAND) {
plum[[brand]] <- as.integer(grepl(brand, plum$BRAND))
}
# Loop over unique brands and create new columns
for (item in unique_values_list$ITEM) {
plum[[item]] <- as.integer(grepl(item, plum$ITEM))
}
# Loop over unique regions and create new columns
for (package in unique_values_list$PACKAGE) {
plum[[package]] <- as.integer(grepl(package, plum$PACKAGE))
}
# Loop over unique regions and create new columns
for (season in unique_values_list$SEASON) {
plum[[season]] <- as.integer(grepl(season, plum$SEASON))
}
plum <- plum[, !names(plum) == ""]
# Remove unnecessary columns
one_hot_plum <- plum %>%
select(-CALORIC_SEGMENT, -CATEGORY, -MANUFACTURER, -BRAND, -REGION, -PACKAGE, -ITEM)
head(one_hot_plum)
## # A tibble: 6 × 37
## DATE UNIT_SALES DOLLAR_SALES POP_SQMI MONTH SEASON PACKAGE2 ENERGY_DRINK
## <date> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 2022-06-18 1 4.62 1.20 6 SUMMER CUP 12 … 0
## 2 2022-04-30 14 86.9 1.20 4 SPRING CUP 12 … 0
## 3 2021-12-11 18 89.7 1.20 12 WINTER CUP 12 … 0
## 4 2022-07-30 13 65.6 1.20 7 SUMMER CUP 12 … 0
## 5 2021-11-27 19 72.9 1.20 11 FALL CUP 12 … 0
## 6 2022-06-11 4 25.6 1.20 6 SUMMER CUP 12 … 0
## # ℹ 29 more variables: CALORIC_SEGMENT_TEXT <chr>, min_launch_date <date>,
## # WEEKS_SINCE_LAUNCH <dbl>, NORTHERN <int>, CALI_NEVADA <int>,
## # DESERT_SW <int>, MOUNTAIN <int>, SOCAL <int>, PRAIRIE <int>, ARIZONA <int>,
## # NEWMEXICO <int>, NOCAL <int>, COLORADO <int>, KANSAS <int>,
## # `HILL MOISTURE THRASHED APPLE` <int>, `BEAUTIFUL GREENER` <int>,
## # `SINGLE GROUP` <int>, `RAINING THRASHED PLUM` <int>,
## # `BEAUTIFUL GREENER PLUM` <int>, `ZIZZLES PLUM` <int>, …
write.csv(one_hot_plum, "one_hot_plum.csv", row.names = FALSE)
#cleanup all objects except one_hot_plum
#rm(list = setdiff(ls(), "one_hot_plum"))
# Load and prepare dataset
df1 <- read.csv("one_hot_plum.csv")
df1 <- df1 %>%
select(-DATE, -MONTH, -WINTER, -SPRING, -FALL, -DOLLAR_SALES, -SUMMER, -SEASON)
# Summarize the dataset
skimr::skim(df1)
| Name | df1 |
| Number of rows | 43366 |
| Number of columns | 29 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| logical | 1 |
| numeric | 26 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| PACKAGE2 | 0 | 1 | 22 | 26 | 0 | 5 | 0 |
| min_launch_date | 0 | 1 | 10 | 10 | 0 | 2 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| CALORIC_SEGMENT_TEXT | 43366 | 0 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| UNIT_SALES | 0 | 1 | 55.88 | 90.12 | 1.00 | 9.00 | 26.00 | 63.00 | 2716.00 | ▇▁▁▁▁ |
| POP_SQMI | 0 | 1 | 1713.02 | 1965.76 | 0.18 | 52.08 | 836.76 | 3191.96 | 6769.35 | ▇▂▂▁▁ |
| ENERGY_DRINK | 0 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
| WEEKS_SINCE_LAUNCH | 0 | 1 | 59.74 | 35.11 | 0.00 | 30.00 | 58.00 | 86.00 | 128.00 | ▇▇▇▆▅ |
| NORTHERN | 0 | 1 | 0.15 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| CALI_NEVADA | 0 | 1 | 0.03 | 0.17 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| DESERT_SW | 0 | 1 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| MOUNTAIN | 0 | 1 | 0.09 | 0.29 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| SOCAL | 0 | 1 | 0.16 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| PRAIRIE | 0 | 1 | 0.02 | 0.15 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| ARIZONA | 0 | 1 | 0.24 | 0.43 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| NEWMEXICO | 0 | 1 | 0.04 | 0.19 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| NOCAL | 0 | 1 | 0.06 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| COLORADO | 0 | 1 | 0.12 | 0.33 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| KANSAS | 0 | 1 | 0.02 | 0.14 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| HILL.MOISTURE.THRASHED.APPLE | 0 | 1 | 0.34 | 0.47 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▅ |
| BEAUTIFUL.GREENER | 0 | 1 | 0.58 | 0.49 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▆▁▁▁▇ |
| SINGLE.GROUP | 0 | 1 | 0.08 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| RAINING..THRASHED.PLUM | 0 | 1 | 0.34 | 0.47 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▅ |
| BEAUTIFUL.GREENER..PLUM | 0 | 1 | 0.58 | 0.49 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▆▁▁▁▇ |
| ZIZZLES..PLUM | 0 | 1 | 0.08 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| X12SMALL.12ONE.CUP | 0 | 1 | 0.88 | 0.32 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▁▁▁▁▇ |
| X12SMALL.6ONE.CUP | 0 | 1 | 0.01 | 0.08 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| X12SMALL.24ONE.CUP | 0 | 1 | 0.07 | 0.25 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| X12SMALL.20ONE.CUP | 0 | 1 | 0.01 | 0.10 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| X12SMALL.18ONE.CUP | 0 | 1 | 0.03 | 0.17 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
#Remove outliers in top 1% of Unit Sales.
df1 <- df1 %>% filter(UNIT_SALES < quantile(UNIT_SALES, 0.99))
# Split the data
set.seed(123)
df_testtrn <- initial_split(df1, prop = 0.8, strata = UNIT_SALES)
Train <- training(df_testtrn)
Test <- testing(df_testtrn)
# Prepare features and labels for XGBoost
train_features <- Train[, -which(names(Train) == "UNIT_SALES")]
train_labels <- Train$UNIT_SALES
test_features <- Test[, -which(names(Test) == "UNIT_SALES")]
test_labels <- Test$UNIT_SALES
train_features <- lapply(train_features, as.numeric)
## Warning in lapply(train_features, as.numeric): NAs introduced by coercion
## Warning in lapply(train_features, as.numeric): NAs introduced by coercion
train_labels <- as.numeric(train_labels)
train_features <- lapply(train_features, as.numeric)
# Convert the list to a matrix
train_matrix <- do.call(cbind, train_features)
# Create DMatrix
train_dmatrix <- xgb.DMatrix(data = train_matrix, label = train_labels)
test_features <- lapply(test_features, as.numeric)
## Warning in lapply(test_features, as.numeric): NAs introduced by coercion
## Warning in lapply(test_features, as.numeric): NAs introduced by coercion
# Convert the list to a matrix
test_matrix <- do.call(cbind, test_features)
# Create DMatrix
test_dmatrix <- xgb.DMatrix(data = test_matrix, label = test_labels)
# Define XGBoost parameters
set.seed(123)
params <- list(
booster = "gbtree",
objective = "reg:squarederror",
eval_metric = "rmse",
eta = 0.05,
max_depth = 3,
min_child_weight = 1,
subsample = 0.7,
colsample_bytree = 0.6,
reg_lambda = 1,
reg_alpha = 0
)
# #use all cores except for one
# doParallel::registerDoParallel(cores = parallel::detectCores() - 1)
# Perform cross-validation to find the optimal number of boosting rounds
cv_results <- xgb.cv(
params = params,
data = train_dmatrix,
nfold = 5,
nrounds = 500, # Changed from 'num_boost_round' to 'nrounds'
early_stopping_rounds = 10,
metrics = "rmse",
seed = 123
)
## [1] train-rmse:81.630267+0.229463 test-rmse:81.626172+0.677657
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
##
## [2] train-rmse:79.881361+0.214846 test-rmse:79.871352+0.708257
## [3] train-rmse:78.272570+0.223207 test-rmse:78.259611+0.724948
## [4] train-rmse:76.823268+0.220764 test-rmse:76.809766+0.713121
## [5] train-rmse:75.485726+0.263909 test-rmse:75.475546+0.668567
## [6] train-rmse:74.292984+0.212330 test-rmse:74.288742+0.693536
## [7] train-rmse:73.127865+0.209287 test-rmse:73.129229+0.710464
## [8] train-rmse:72.043028+0.174393 test-rmse:72.043241+0.769906
## [9] train-rmse:71.080869+0.169583 test-rmse:71.082009+0.790404
## [10] train-rmse:70.164555+0.190074 test-rmse:70.167679+0.758776
## [11] train-rmse:69.370928+0.240068 test-rmse:69.375989+0.722733
## [12] train-rmse:68.589532+0.273396 test-rmse:68.594246+0.662321
## [13] train-rmse:67.829737+0.313070 test-rmse:67.837536+0.657090
## [14] train-rmse:67.158882+0.261371 test-rmse:67.165648+0.670325
## [15] train-rmse:66.592793+0.226190 test-rmse:66.601380+0.677492
## [16] train-rmse:66.031029+0.253788 test-rmse:66.049014+0.669672
## [17] train-rmse:65.488243+0.220998 test-rmse:65.505498+0.741396
## [18] train-rmse:65.014802+0.198313 test-rmse:65.035193+0.735735
## [19] train-rmse:64.557788+0.201890 test-rmse:64.580867+0.718667
## [20] train-rmse:64.170577+0.200613 test-rmse:64.196890+0.731885
## [21] train-rmse:63.783276+0.214663 test-rmse:63.802089+0.659697
## [22] train-rmse:63.441278+0.258247 test-rmse:63.456047+0.643660
## [23] train-rmse:63.093087+0.289292 test-rmse:63.109051+0.621254
## [24] train-rmse:62.756751+0.298107 test-rmse:62.780768+0.590581
## [25] train-rmse:62.478861+0.286614 test-rmse:62.505645+0.566679
## [26] train-rmse:62.198562+0.255273 test-rmse:62.228255+0.579993
## [27] train-rmse:61.948781+0.264637 test-rmse:61.986840+0.583754
## [28] train-rmse:61.690860+0.260911 test-rmse:61.727381+0.586151
## [29] train-rmse:61.479941+0.271026 test-rmse:61.528549+0.589573
## [30] train-rmse:61.268712+0.293149 test-rmse:61.316958+0.585824
## [31] train-rmse:61.107641+0.293732 test-rmse:61.153426+0.561179
## [32] train-rmse:60.932865+0.309358 test-rmse:60.980867+0.536239
## [33] train-rmse:60.752950+0.285379 test-rmse:60.798071+0.539515
## [34] train-rmse:60.588747+0.289656 test-rmse:60.624855+0.555171
## [35] train-rmse:60.460900+0.293315 test-rmse:60.500084+0.543081
## [36] train-rmse:60.291709+0.315158 test-rmse:60.339228+0.531428
## [37] train-rmse:60.147875+0.330357 test-rmse:60.193080+0.530971
## [38] train-rmse:60.006961+0.342518 test-rmse:60.056798+0.560064
## [39] train-rmse:59.882822+0.318971 test-rmse:59.935740+0.542633
## [40] train-rmse:59.741222+0.346142 test-rmse:59.795272+0.517427
## [41] train-rmse:59.657343+0.351221 test-rmse:59.711949+0.494237
## [42] train-rmse:59.566380+0.366305 test-rmse:59.621232+0.518428
## [43] train-rmse:59.457046+0.317887 test-rmse:59.514116+0.538085
## [44] train-rmse:59.356065+0.304824 test-rmse:59.411539+0.549573
## [45] train-rmse:59.244664+0.326972 test-rmse:59.303286+0.539908
## [46] train-rmse:59.168666+0.353693 test-rmse:59.228497+0.558409
## [47] train-rmse:59.019725+0.290506 test-rmse:59.084133+0.563688
## [48] train-rmse:58.945742+0.286975 test-rmse:59.012749+0.556246
## [49] train-rmse:58.851308+0.269773 test-rmse:58.917844+0.569225
## [50] train-rmse:58.775141+0.268298 test-rmse:58.839237+0.547050
## [51] train-rmse:58.700958+0.281126 test-rmse:58.767071+0.567100
## [52] train-rmse:58.640054+0.278511 test-rmse:58.709655+0.567117
## [53] train-rmse:58.550649+0.273418 test-rmse:58.617611+0.592622
## [54] train-rmse:58.477617+0.280776 test-rmse:58.544208+0.574535
## [55] train-rmse:58.407846+0.302689 test-rmse:58.471429+0.572017
## [56] train-rmse:58.356404+0.321335 test-rmse:58.423785+0.587572
## [57] train-rmse:58.290028+0.294739 test-rmse:58.360798+0.589571
## [58] train-rmse:58.175656+0.295934 test-rmse:58.251539+0.552968
## [59] train-rmse:58.067886+0.260151 test-rmse:58.147153+0.587540
## [60] train-rmse:58.001803+0.256351 test-rmse:58.081751+0.584127
## [61] train-rmse:57.932900+0.247721 test-rmse:58.010225+0.595180
## [62] train-rmse:57.875039+0.251259 test-rmse:57.954666+0.611563
## [63] train-rmse:57.827711+0.251043 test-rmse:57.910608+0.615903
## [64] train-rmse:57.758052+0.279961 test-rmse:57.843716+0.598654
## [65] train-rmse:57.681478+0.257849 test-rmse:57.767914+0.631803
## [66] train-rmse:57.637336+0.255986 test-rmse:57.723125+0.643561
## [67] train-rmse:57.533389+0.190269 test-rmse:57.614456+0.607521
## [68] train-rmse:57.479300+0.191485 test-rmse:57.557656+0.620173
## [69] train-rmse:57.410259+0.199307 test-rmse:57.490650+0.615157
## [70] train-rmse:57.360313+0.202300 test-rmse:57.441984+0.625113
## [71] train-rmse:57.271802+0.215720 test-rmse:57.361508+0.636964
## [72] train-rmse:57.209691+0.213607 test-rmse:57.300893+0.666422
## [73] train-rmse:57.174996+0.214229 test-rmse:57.267039+0.662422
## [74] train-rmse:57.153483+0.213064 test-rmse:57.245148+0.663866
## [75] train-rmse:57.096608+0.229833 test-rmse:57.191958+0.647759
## [76] train-rmse:57.033672+0.215826 test-rmse:57.126512+0.638247
## [77] train-rmse:56.970746+0.175763 test-rmse:57.061061+0.649533
## [78] train-rmse:56.901679+0.207057 test-rmse:56.994932+0.657498
## [79] train-rmse:56.833526+0.177909 test-rmse:56.920276+0.647096
## [80] train-rmse:56.762319+0.202860 test-rmse:56.849101+0.686820
## [81] train-rmse:56.739632+0.208370 test-rmse:56.827357+0.693597
## [82] train-rmse:56.648016+0.234231 test-rmse:56.740654+0.691097
## [83] train-rmse:56.600933+0.249074 test-rmse:56.698080+0.685679
## [84] train-rmse:56.571267+0.243906 test-rmse:56.666838+0.700942
## [85] train-rmse:56.507834+0.246624 test-rmse:56.609482+0.693862
## [86] train-rmse:56.452263+0.248738 test-rmse:56.551870+0.701024
## [87] train-rmse:56.413453+0.263145 test-rmse:56.517679+0.702240
## [88] train-rmse:56.359159+0.265078 test-rmse:56.466690+0.728637
## [89] train-rmse:56.309135+0.250641 test-rmse:56.419189+0.728227
## [90] train-rmse:56.226592+0.225650 test-rmse:56.338204+0.662713
## [91] train-rmse:56.147147+0.182210 test-rmse:56.256643+0.670402
## [92] train-rmse:56.076532+0.191742 test-rmse:56.186993+0.674354
## [93] train-rmse:56.046896+0.186431 test-rmse:56.156393+0.672504
## [94] train-rmse:56.007977+0.199387 test-rmse:56.117867+0.682580
## [95] train-rmse:55.960778+0.210375 test-rmse:56.072375+0.702053
## [96] train-rmse:55.917261+0.187397 test-rmse:56.024519+0.703274
## [97] train-rmse:55.871236+0.198597 test-rmse:55.976201+0.718171
## [98] train-rmse:55.839504+0.198780 test-rmse:55.946158+0.726819
## [99] train-rmse:55.809102+0.194073 test-rmse:55.917547+0.736123
## [100] train-rmse:55.763872+0.192004 test-rmse:55.876098+0.730846
## [101] train-rmse:55.731571+0.191632 test-rmse:55.841217+0.740818
## [102] train-rmse:55.695517+0.217581 test-rmse:55.807055+0.736365
## [103] train-rmse:55.624197+0.253877 test-rmse:55.738564+0.772335
## [104] train-rmse:55.559861+0.268020 test-rmse:55.682832+0.786981
## [105] train-rmse:55.519325+0.247517 test-rmse:55.645350+0.781678
## [106] train-rmse:55.484492+0.252326 test-rmse:55.615998+0.787121
## [107] train-rmse:55.456025+0.224627 test-rmse:55.586589+0.761211
## [108] train-rmse:55.425135+0.239484 test-rmse:55.554160+0.767244
## [109] train-rmse:55.372286+0.228401 test-rmse:55.505225+0.767243
## [110] train-rmse:55.336798+0.224742 test-rmse:55.469656+0.755692
## [111] train-rmse:55.279331+0.192203 test-rmse:55.407375+0.734264
## [112] train-rmse:55.217986+0.204361 test-rmse:55.349583+0.746905
## [113] train-rmse:55.174985+0.230868 test-rmse:55.307083+0.774874
## [114] train-rmse:55.141846+0.244563 test-rmse:55.278286+0.788674
## [115] train-rmse:55.085143+0.216153 test-rmse:55.222362+0.764833
## [116] train-rmse:55.024239+0.233182 test-rmse:55.166613+0.775983
## [117] train-rmse:54.991152+0.239653 test-rmse:55.133260+0.781301
## [118] train-rmse:54.962796+0.244577 test-rmse:55.106155+0.782663
## [119] train-rmse:54.926258+0.246865 test-rmse:55.075595+0.781010
## [120] train-rmse:54.867396+0.250801 test-rmse:55.020095+0.781422
## [121] train-rmse:54.844965+0.259237 test-rmse:55.001040+0.790954
## [122] train-rmse:54.821032+0.264449 test-rmse:54.980359+0.797478
## [123] train-rmse:54.786947+0.255252 test-rmse:54.946311+0.790867
## [124] train-rmse:54.762009+0.252717 test-rmse:54.922246+0.785393
## [125] train-rmse:54.718246+0.225357 test-rmse:54.878878+0.766525
## [126] train-rmse:54.684889+0.231414 test-rmse:54.852156+0.754337
## [127] train-rmse:54.641425+0.249492 test-rmse:54.810879+0.768981
## [128] train-rmse:54.609445+0.238509 test-rmse:54.785242+0.775092
## [129] train-rmse:54.581857+0.243227 test-rmse:54.760809+0.768996
## [130] train-rmse:54.536256+0.257385 test-rmse:54.718157+0.745436
## [131] train-rmse:54.509029+0.261115 test-rmse:54.687466+0.735486
## [132] train-rmse:54.475014+0.234828 test-rmse:54.646354+0.692035
## [133] train-rmse:54.429854+0.208334 test-rmse:54.602112+0.694216
## [134] train-rmse:54.367643+0.193179 test-rmse:54.535427+0.690863
## [135] train-rmse:54.331437+0.182788 test-rmse:54.499966+0.676328
## [136] train-rmse:54.300474+0.175121 test-rmse:54.472845+0.674958
## [137] train-rmse:54.276207+0.188594 test-rmse:54.450935+0.685443
## [138] train-rmse:54.246681+0.192243 test-rmse:54.424247+0.683780
## [139] train-rmse:54.157299+0.184642 test-rmse:54.334394+0.677182
## [140] train-rmse:54.106344+0.198444 test-rmse:54.280891+0.649584
## [141] train-rmse:54.056257+0.183672 test-rmse:54.235390+0.655390
## [142] train-rmse:54.029176+0.177656 test-rmse:54.212888+0.660211
## [143] train-rmse:53.998538+0.164630 test-rmse:54.181777+0.637161
## [144] train-rmse:53.959065+0.166354 test-rmse:54.140648+0.639510
## [145] train-rmse:53.905649+0.173966 test-rmse:54.087415+0.630048
## [146] train-rmse:53.888002+0.171069 test-rmse:54.069365+0.635489
## [147] train-rmse:53.843474+0.154882 test-rmse:54.019988+0.597519
## [148] train-rmse:53.799445+0.151965 test-rmse:53.981716+0.615149
## [149] train-rmse:53.771360+0.139799 test-rmse:53.954076+0.625219
## [150] train-rmse:53.732207+0.128550 test-rmse:53.910070+0.604666
## [151] train-rmse:53.699590+0.123948 test-rmse:53.881789+0.601795
## [152] train-rmse:53.667627+0.130865 test-rmse:53.852119+0.616736
## [153] train-rmse:53.634653+0.119205 test-rmse:53.820714+0.625430
## [154] train-rmse:53.587645+0.142444 test-rmse:53.769894+0.620149
## [155] train-rmse:53.540948+0.143567 test-rmse:53.722554+0.629343
## [156] train-rmse:53.487746+0.137733 test-rmse:53.669454+0.644294
## [157] train-rmse:53.465815+0.131668 test-rmse:53.648748+0.639706
## [158] train-rmse:53.419272+0.137845 test-rmse:53.605304+0.631079
## [159] train-rmse:53.368737+0.151506 test-rmse:53.553924+0.631673
## [160] train-rmse:53.308563+0.138651 test-rmse:53.493494+0.582431
## [161] train-rmse:53.259921+0.139940 test-rmse:53.450387+0.592862
## [162] train-rmse:53.223558+0.126198 test-rmse:53.414197+0.587881
## [163] train-rmse:53.198955+0.133516 test-rmse:53.389879+0.581505
## [164] train-rmse:53.162419+0.124220 test-rmse:53.355610+0.571083
## [165] train-rmse:53.131560+0.129566 test-rmse:53.327911+0.579513
## [166] train-rmse:53.106261+0.118362 test-rmse:53.298729+0.545948
## [167] train-rmse:53.068000+0.153167 test-rmse:53.264944+0.530046
## [168] train-rmse:53.015458+0.159450 test-rmse:53.209961+0.538813
## [169] train-rmse:52.983381+0.158104 test-rmse:53.180160+0.545268
## [170] train-rmse:52.959562+0.150463 test-rmse:53.158061+0.527766
## [171] train-rmse:52.935396+0.142959 test-rmse:53.131941+0.545097
## [172] train-rmse:52.886603+0.166744 test-rmse:53.084112+0.528699
## [173] train-rmse:52.866571+0.165255 test-rmse:53.064859+0.512178
## [174] train-rmse:52.804424+0.155451 test-rmse:52.998700+0.509459
## [175] train-rmse:52.770706+0.175907 test-rmse:52.965344+0.512573
## [176] train-rmse:52.757397+0.170585 test-rmse:52.953940+0.508158
## [177] train-rmse:52.730016+0.157395 test-rmse:52.927338+0.511229
## [178] train-rmse:52.682640+0.157329 test-rmse:52.877060+0.522447
## [179] train-rmse:52.648153+0.152243 test-rmse:52.845066+0.498917
## [180] train-rmse:52.623483+0.143623 test-rmse:52.820560+0.499567
## [181] train-rmse:52.586091+0.170753 test-rmse:52.784392+0.487405
## [182] train-rmse:52.552466+0.150088 test-rmse:52.754478+0.500995
## [183] train-rmse:52.529281+0.151226 test-rmse:52.731807+0.506121
## [184] train-rmse:52.491531+0.153825 test-rmse:52.695661+0.470661
## [185] train-rmse:52.435932+0.184135 test-rmse:52.640909+0.452383
## [186] train-rmse:52.413296+0.199402 test-rmse:52.619422+0.446946
## [187] train-rmse:52.346619+0.208648 test-rmse:52.559863+0.447871
## [188] train-rmse:52.319457+0.223336 test-rmse:52.529680+0.445157
## [189] train-rmse:52.284153+0.217766 test-rmse:52.496169+0.451123
## [190] train-rmse:52.259389+0.205419 test-rmse:52.471109+0.463142
## [191] train-rmse:52.209733+0.217047 test-rmse:52.421514+0.461682
## [192] train-rmse:52.192552+0.216426 test-rmse:52.404620+0.450037
## [193] train-rmse:52.155634+0.209588 test-rmse:52.370776+0.470523
## [194] train-rmse:52.115550+0.202560 test-rmse:52.334571+0.499604
## [195] train-rmse:52.092087+0.205112 test-rmse:52.312068+0.487624
## [196] train-rmse:52.059447+0.207383 test-rmse:52.284108+0.513159
## [197] train-rmse:52.020032+0.204224 test-rmse:52.252744+0.514555
## [198] train-rmse:52.009217+0.205728 test-rmse:52.241914+0.521988
## [199] train-rmse:52.003194+0.206391 test-rmse:52.236938+0.521398
## [200] train-rmse:51.962240+0.237688 test-rmse:52.192219+0.529822
## [201] train-rmse:51.946828+0.249863 test-rmse:52.175337+0.534000
## [202] train-rmse:51.914253+0.264252 test-rmse:52.148062+0.523497
## [203] train-rmse:51.896456+0.249079 test-rmse:52.130681+0.519582
## [204] train-rmse:51.866326+0.260130 test-rmse:52.102796+0.503687
## [205] train-rmse:51.841111+0.255080 test-rmse:52.076538+0.510993
## [206] train-rmse:51.814327+0.246788 test-rmse:52.049333+0.515875
## [207] train-rmse:51.799398+0.244131 test-rmse:52.035073+0.523803
## [208] train-rmse:51.771106+0.231015 test-rmse:52.012317+0.528578
## [209] train-rmse:51.749950+0.232870 test-rmse:51.992807+0.540014
## [210] train-rmse:51.726653+0.251009 test-rmse:51.971759+0.541791
## [211] train-rmse:51.682140+0.262865 test-rmse:51.933076+0.569675
## [212] train-rmse:51.671846+0.261094 test-rmse:51.923722+0.570758
## [213] train-rmse:51.617854+0.255221 test-rmse:51.867466+0.545813
## [214] train-rmse:51.586885+0.279584 test-rmse:51.835227+0.561485
## [215] train-rmse:51.550567+0.258969 test-rmse:51.798920+0.548652
## [216] train-rmse:51.536433+0.260029 test-rmse:51.788176+0.549711
## [217] train-rmse:51.512754+0.253103 test-rmse:51.769537+0.560652
## [218] train-rmse:51.479070+0.247629 test-rmse:51.735128+0.578009
## [219] train-rmse:51.392100+0.265619 test-rmse:51.649909+0.578546
## [220] train-rmse:51.365021+0.278659 test-rmse:51.621507+0.595252
## [221] train-rmse:51.352615+0.283549 test-rmse:51.610475+0.593393
## [222] train-rmse:51.304614+0.291410 test-rmse:51.557559+0.601681
## [223] train-rmse:51.294926+0.289357 test-rmse:51.548150+0.604253
## [224] train-rmse:51.282344+0.293292 test-rmse:51.536923+0.610662
## [225] train-rmse:51.265122+0.286985 test-rmse:51.521889+0.608402
## [226] train-rmse:51.207154+0.283790 test-rmse:51.467357+0.633032
## [227] train-rmse:51.176420+0.270235 test-rmse:51.440416+0.627965
## [228] train-rmse:51.152071+0.272282 test-rmse:51.413688+0.631166
## [229] train-rmse:51.093520+0.292179 test-rmse:51.360106+0.630020
## [230] train-rmse:51.060541+0.305285 test-rmse:51.326923+0.652909
## [231] train-rmse:51.011456+0.321436 test-rmse:51.284448+0.653974
## [232] train-rmse:50.988841+0.309681 test-rmse:51.261488+0.632388
## [233] train-rmse:50.969569+0.303568 test-rmse:51.243048+0.614300
## [234] train-rmse:50.943983+0.301473 test-rmse:51.215808+0.608908
## [235] train-rmse:50.922203+0.307744 test-rmse:51.193374+0.620012
## [236] train-rmse:50.860435+0.285054 test-rmse:51.130823+0.610779
## [237] train-rmse:50.834252+0.304128 test-rmse:51.103216+0.632782
## [238] train-rmse:50.799644+0.318413 test-rmse:51.072520+0.623841
## [239] train-rmse:50.778742+0.308479 test-rmse:51.052386+0.624047
## [240] train-rmse:50.750556+0.319797 test-rmse:51.027065+0.642584
## [241] train-rmse:50.722183+0.335080 test-rmse:50.999459+0.654873
## [242] train-rmse:50.688058+0.333081 test-rmse:50.971423+0.661332
## [243] train-rmse:50.655667+0.332073 test-rmse:50.934046+0.655906
## [244] train-rmse:50.633679+0.335648 test-rmse:50.913231+0.661264
## [245] train-rmse:50.597141+0.330307 test-rmse:50.878455+0.671071
## [246] train-rmse:50.577970+0.337149 test-rmse:50.862143+0.678568
## [247] train-rmse:50.542562+0.355844 test-rmse:50.829991+0.698754
## [248] train-rmse:50.517680+0.347116 test-rmse:50.805375+0.683405
## [249] train-rmse:50.472970+0.337021 test-rmse:50.767970+0.676634
## [250] train-rmse:50.451522+0.339911 test-rmse:50.745561+0.674978
## [251] train-rmse:50.432129+0.340671 test-rmse:50.727929+0.674892
## [252] train-rmse:50.417412+0.337981 test-rmse:50.713877+0.673192
## [253] train-rmse:50.394618+0.349142 test-rmse:50.691315+0.664828
## [254] train-rmse:50.343162+0.369974 test-rmse:50.645195+0.688738
## [255] train-rmse:50.330032+0.368420 test-rmse:50.630270+0.687412
## [256] train-rmse:50.295396+0.366108 test-rmse:50.599215+0.685545
## [257] train-rmse:50.267211+0.383087 test-rmse:50.572509+0.697634
## [258] train-rmse:50.244212+0.399685 test-rmse:50.551951+0.699466
## [259] train-rmse:50.228644+0.385239 test-rmse:50.536837+0.693481
## [260] train-rmse:50.208809+0.377760 test-rmse:50.517713+0.686768
## [261] train-rmse:50.158949+0.384672 test-rmse:50.475248+0.682131
## [262] train-rmse:50.133027+0.378739 test-rmse:50.451532+0.685988
## [263] train-rmse:50.101055+0.380445 test-rmse:50.420908+0.697885
## [264] train-rmse:50.070740+0.400868 test-rmse:50.389424+0.706917
## [265] train-rmse:50.049657+0.412831 test-rmse:50.369297+0.709664
## [266] train-rmse:50.016947+0.410031 test-rmse:50.337030+0.696831
## [267] train-rmse:49.983327+0.411823 test-rmse:50.307918+0.686903
## [268] train-rmse:49.960177+0.414932 test-rmse:50.287154+0.686481
## [269] train-rmse:49.927445+0.395192 test-rmse:50.254636+0.671212
## [270] train-rmse:49.900721+0.412398 test-rmse:50.227456+0.674961
## [271] train-rmse:49.870063+0.400849 test-rmse:50.204033+0.678661
## [272] train-rmse:49.825654+0.382584 test-rmse:50.157693+0.687798
## [273] train-rmse:49.811453+0.390097 test-rmse:50.144145+0.687643
## [274] train-rmse:49.785182+0.393194 test-rmse:50.119111+0.674186
## [275] train-rmse:49.762545+0.402275 test-rmse:50.097474+0.690686
## [276] train-rmse:49.740347+0.399995 test-rmse:50.077029+0.694611
## [277] train-rmse:49.719806+0.398144 test-rmse:50.060300+0.696424
## [278] train-rmse:49.687613+0.393686 test-rmse:50.031561+0.690431
## [279] train-rmse:49.662784+0.362996 test-rmse:50.003932+0.674240
## [280] train-rmse:49.635479+0.346225 test-rmse:49.975956+0.675142
## [281] train-rmse:49.584697+0.336221 test-rmse:49.920168+0.637033
## [282] train-rmse:49.562219+0.318312 test-rmse:49.901838+0.633121
## [283] train-rmse:49.527981+0.306058 test-rmse:49.866872+0.623275
## [284] train-rmse:49.508551+0.312345 test-rmse:49.846062+0.618571
## [285] train-rmse:49.492505+0.322860 test-rmse:49.830495+0.616734
## [286] train-rmse:49.474078+0.314812 test-rmse:49.812887+0.603190
## [287] train-rmse:49.443111+0.298315 test-rmse:49.783119+0.596401
## [288] train-rmse:49.429506+0.289426 test-rmse:49.769931+0.591344
## [289] train-rmse:49.411895+0.292062 test-rmse:49.757387+0.597523
## [290] train-rmse:49.389354+0.294248 test-rmse:49.738389+0.611756
## [291] train-rmse:49.368772+0.297795 test-rmse:49.719640+0.622860
## [292] train-rmse:49.340714+0.283909 test-rmse:49.692721+0.618444
## [293] train-rmse:49.320961+0.278823 test-rmse:49.674466+0.607283
## [294] train-rmse:49.314610+0.282697 test-rmse:49.668728+0.612377
## [295] train-rmse:49.300537+0.277849 test-rmse:49.656495+0.608479
## [296] train-rmse:49.286605+0.283280 test-rmse:49.643806+0.605509
## [297] train-rmse:49.275747+0.277101 test-rmse:49.634159+0.596675
## [298] train-rmse:49.229699+0.281668 test-rmse:49.596139+0.574301
## [299] train-rmse:49.209023+0.291725 test-rmse:49.573944+0.571933
## [300] train-rmse:49.164716+0.293815 test-rmse:49.531079+0.578749
## [301] train-rmse:49.146106+0.285626 test-rmse:49.513206+0.558163
## [302] train-rmse:49.136925+0.283836 test-rmse:49.503960+0.561784
## [303] train-rmse:49.109552+0.283844 test-rmse:49.476731+0.564421
## [304] train-rmse:49.073856+0.257565 test-rmse:49.437303+0.537131
## [305] train-rmse:49.049789+0.265613 test-rmse:49.408540+0.542514
## [306] train-rmse:49.029791+0.255780 test-rmse:49.387674+0.533429
## [307] train-rmse:49.021435+0.251833 test-rmse:49.379007+0.527744
## [308] train-rmse:49.007050+0.248746 test-rmse:49.365310+0.525535
## [309] train-rmse:48.997660+0.246208 test-rmse:49.358204+0.524078
## [310] train-rmse:48.986972+0.241315 test-rmse:49.346693+0.529508
## [311] train-rmse:48.976488+0.238269 test-rmse:49.338820+0.533055
## [312] train-rmse:48.957183+0.230884 test-rmse:49.321145+0.531541
## [313] train-rmse:48.917265+0.224773 test-rmse:49.284210+0.527102
## [314] train-rmse:48.890188+0.245408 test-rmse:49.253327+0.540031
## [315] train-rmse:48.863295+0.225973 test-rmse:49.231574+0.521921
## [316] train-rmse:48.821355+0.207980 test-rmse:49.191054+0.521142
## [317] train-rmse:48.808292+0.196583 test-rmse:49.176090+0.517757
## [318] train-rmse:48.772804+0.197224 test-rmse:49.140060+0.525197
## [319] train-rmse:48.746310+0.219849 test-rmse:49.116804+0.530430
## [320] train-rmse:48.723188+0.223478 test-rmse:49.095551+0.549647
## [321] train-rmse:48.693346+0.212870 test-rmse:49.069517+0.545191
## [322] train-rmse:48.675566+0.222117 test-rmse:49.052229+0.546813
## [323] train-rmse:48.632635+0.215210 test-rmse:49.005653+0.545449
## [324] train-rmse:48.614469+0.212986 test-rmse:48.987033+0.557075
## [325] train-rmse:48.585699+0.221631 test-rmse:48.958753+0.543703
## [326] train-rmse:48.572150+0.221539 test-rmse:48.946046+0.551685
## [327] train-rmse:48.533051+0.244693 test-rmse:48.912639+0.551135
## [328] train-rmse:48.508351+0.247601 test-rmse:48.888423+0.542116
## [329] train-rmse:48.496827+0.249885 test-rmse:48.877426+0.543831
## [330] train-rmse:48.473542+0.256217 test-rmse:48.854418+0.534374
## [331] train-rmse:48.435336+0.269650 test-rmse:48.815055+0.546765
## [332] train-rmse:48.398496+0.270693 test-rmse:48.783446+0.568091
## [333] train-rmse:48.384039+0.270039 test-rmse:48.768703+0.559819
## [334] train-rmse:48.352772+0.267414 test-rmse:48.740104+0.582179
## [335] train-rmse:48.317041+0.278392 test-rmse:48.700070+0.582745
## [336] train-rmse:48.271014+0.263634 test-rmse:48.653796+0.581571
## [337] train-rmse:48.243069+0.262032 test-rmse:48.624698+0.574850
## [338] train-rmse:48.206546+0.256782 test-rmse:48.582775+0.593158
## [339] train-rmse:48.173983+0.235422 test-rmse:48.549086+0.570323
## [340] train-rmse:48.126806+0.243896 test-rmse:48.504181+0.585975
## [341] train-rmse:48.107737+0.232417 test-rmse:48.487309+0.580565
## [342] train-rmse:48.101207+0.236045 test-rmse:48.480873+0.582545
## [343] train-rmse:48.085998+0.229913 test-rmse:48.466236+0.581520
## [344] train-rmse:48.049837+0.227344 test-rmse:48.433509+0.587152
## [345] train-rmse:48.043540+0.228144 test-rmse:48.426248+0.591686
## [346] train-rmse:48.026740+0.233056 test-rmse:48.409055+0.600257
## [347] train-rmse:48.004036+0.240722 test-rmse:48.390861+0.606977
## [348] train-rmse:47.967174+0.235595 test-rmse:48.350300+0.595788
## [349] train-rmse:47.932734+0.235434 test-rmse:48.314904+0.571911
## [350] train-rmse:47.923112+0.237971 test-rmse:48.305856+0.579309
## [351] train-rmse:47.902938+0.242207 test-rmse:48.286165+0.589381
## [352] train-rmse:47.893122+0.239086 test-rmse:48.276899+0.587128
## [353] train-rmse:47.869678+0.240972 test-rmse:48.251558+0.573653
## [354] train-rmse:47.852990+0.248541 test-rmse:48.233590+0.573416
## [355] train-rmse:47.835669+0.250742 test-rmse:48.216615+0.568587
## [356] train-rmse:47.820683+0.257695 test-rmse:48.202918+0.580424
## [357] train-rmse:47.793758+0.243801 test-rmse:48.176515+0.569697
## [358] train-rmse:47.778933+0.249065 test-rmse:48.163044+0.579987
## [359] train-rmse:47.756398+0.251679 test-rmse:48.141452+0.568959
## [360] train-rmse:47.739446+0.249716 test-rmse:48.126617+0.562338
## [361] train-rmse:47.699373+0.231294 test-rmse:48.082723+0.545079
## [362] train-rmse:47.692692+0.229454 test-rmse:48.078347+0.540821
## [363] train-rmse:47.659250+0.234960 test-rmse:48.048442+0.549846
## [364] train-rmse:47.646098+0.227668 test-rmse:48.037004+0.554563
## [365] train-rmse:47.632850+0.233728 test-rmse:48.026209+0.558965
## [366] train-rmse:47.611175+0.227877 test-rmse:48.005803+0.561262
## [367] train-rmse:47.586992+0.218765 test-rmse:47.979775+0.555790
## [368] train-rmse:47.581754+0.219867 test-rmse:47.976356+0.556308
## [369] train-rmse:47.553657+0.227884 test-rmse:47.952426+0.566285
## [370] train-rmse:47.521702+0.216086 test-rmse:47.918669+0.550609
## [371] train-rmse:47.518039+0.215830 test-rmse:47.915419+0.548730
## [372] train-rmse:47.496125+0.225382 test-rmse:47.892706+0.563828
## [373] train-rmse:47.468260+0.212337 test-rmse:47.868168+0.564246
## [374] train-rmse:47.448251+0.206885 test-rmse:47.849864+0.569064
## [375] train-rmse:47.436414+0.203197 test-rmse:47.838811+0.575176
## [376] train-rmse:47.424658+0.201318 test-rmse:47.829083+0.568163
## [377] train-rmse:47.405887+0.192650 test-rmse:47.812922+0.572107
## [378] train-rmse:47.386599+0.180465 test-rmse:47.793985+0.558789
## [379] train-rmse:47.349698+0.186669 test-rmse:47.756111+0.553993
## [380] train-rmse:47.334275+0.187652 test-rmse:47.740899+0.560318
## [381] train-rmse:47.323605+0.187102 test-rmse:47.732846+0.552433
## [382] train-rmse:47.304820+0.195710 test-rmse:47.716014+0.565000
## [383] train-rmse:47.277525+0.198630 test-rmse:47.691018+0.560168
## [384] train-rmse:47.251091+0.212795 test-rmse:47.665132+0.562658
## [385] train-rmse:47.217116+0.202832 test-rmse:47.628777+0.547529
## [386] train-rmse:47.189932+0.202650 test-rmse:47.599563+0.548529
## [387] train-rmse:47.173037+0.205530 test-rmse:47.586651+0.555150
## [388] train-rmse:47.157772+0.188195 test-rmse:47.572461+0.550594
## [389] train-rmse:47.131156+0.171552 test-rmse:47.547495+0.530978
## [390] train-rmse:47.117274+0.161894 test-rmse:47.531893+0.524464
## [391] train-rmse:47.099124+0.158325 test-rmse:47.518009+0.531957
## [392] train-rmse:47.074843+0.164808 test-rmse:47.495008+0.526256
## [393] train-rmse:47.041917+0.171408 test-rmse:47.460796+0.527938
## [394] train-rmse:47.015490+0.177358 test-rmse:47.435241+0.526948
## [395] train-rmse:46.978819+0.162740 test-rmse:47.398736+0.526761
## [396] train-rmse:46.964785+0.151914 test-rmse:47.388799+0.520883
## [397] train-rmse:46.950314+0.149281 test-rmse:47.373952+0.514508
## [398] train-rmse:46.938541+0.153519 test-rmse:47.362632+0.527526
## [399] train-rmse:46.923992+0.154339 test-rmse:47.349711+0.538247
## [400] train-rmse:46.905430+0.159599 test-rmse:47.333259+0.549625
## [401] train-rmse:46.872646+0.142053 test-rmse:47.300134+0.536052
## [402] train-rmse:46.839009+0.137347 test-rmse:47.269876+0.538512
## [403] train-rmse:46.825924+0.140722 test-rmse:47.257595+0.540718
## [404] train-rmse:46.817986+0.142703 test-rmse:47.250834+0.537692
## [405] train-rmse:46.804420+0.146190 test-rmse:47.239553+0.532900
## [406] train-rmse:46.787158+0.151135 test-rmse:47.220282+0.522133
## [407] train-rmse:46.780307+0.153566 test-rmse:47.213490+0.524579
## [408] train-rmse:46.752274+0.144714 test-rmse:47.183689+0.513214
## [409] train-rmse:46.727256+0.141199 test-rmse:47.162344+0.508690
## [410] train-rmse:46.685495+0.135166 test-rmse:47.123695+0.514060
## [411] train-rmse:46.664700+0.141149 test-rmse:47.101035+0.514550
## [412] train-rmse:46.630422+0.127325 test-rmse:47.070627+0.499000
## [413] train-rmse:46.610133+0.131111 test-rmse:47.050126+0.490990
## [414] train-rmse:46.602250+0.134647 test-rmse:47.043330+0.492376
## [415] train-rmse:46.590089+0.139870 test-rmse:47.032632+0.487568
## [416] train-rmse:46.575823+0.144982 test-rmse:47.020403+0.487019
## [417] train-rmse:46.559308+0.151891 test-rmse:47.001225+0.497517
## [418] train-rmse:46.527756+0.165036 test-rmse:46.971448+0.512566
## [419] train-rmse:46.520385+0.163534 test-rmse:46.963651+0.510733
## [420] train-rmse:46.494781+0.153782 test-rmse:46.939815+0.499866
## [421] train-rmse:46.473816+0.141132 test-rmse:46.919823+0.499814
## [422] train-rmse:46.449073+0.131635 test-rmse:46.896594+0.496520
## [423] train-rmse:46.443901+0.133464 test-rmse:46.893488+0.494243
## [424] train-rmse:46.434006+0.132869 test-rmse:46.882677+0.489194
## [425] train-rmse:46.402466+0.145608 test-rmse:46.848725+0.498665
## [426] train-rmse:46.382356+0.132569 test-rmse:46.828101+0.500320
## [427] train-rmse:46.369642+0.135656 test-rmse:46.816460+0.511416
## [428] train-rmse:46.347415+0.128682 test-rmse:46.795274+0.517803
## [429] train-rmse:46.328780+0.134665 test-rmse:46.775673+0.509930
## [430] train-rmse:46.302149+0.133437 test-rmse:46.748001+0.508469
## [431] train-rmse:46.274665+0.147843 test-rmse:46.722698+0.509686
## [432] train-rmse:46.261323+0.151808 test-rmse:46.709088+0.509848
## [433] train-rmse:46.249735+0.158662 test-rmse:46.698956+0.507027
## [434] train-rmse:46.246740+0.157501 test-rmse:46.695615+0.507617
## [435] train-rmse:46.233844+0.154388 test-rmse:46.683808+0.495762
## [436] train-rmse:46.228359+0.149636 test-rmse:46.679462+0.494084
## [437] train-rmse:46.214059+0.143026 test-rmse:46.664547+0.492178
## [438] train-rmse:46.206525+0.144458 test-rmse:46.658119+0.492648
## [439] train-rmse:46.187209+0.153612 test-rmse:46.639668+0.485821
## [440] train-rmse:46.161612+0.167319 test-rmse:46.616022+0.495188
## [441] train-rmse:46.134639+0.167320 test-rmse:46.590932+0.511245
## [442] train-rmse:46.109484+0.177292 test-rmse:46.567701+0.512079
## [443] train-rmse:46.085227+0.177066 test-rmse:46.546091+0.519862
## [444] train-rmse:46.060860+0.169557 test-rmse:46.523013+0.516550
## [445] train-rmse:46.044945+0.171303 test-rmse:46.506530+0.507326
## [446] train-rmse:46.036119+0.173496 test-rmse:46.497330+0.514307
## [447] train-rmse:46.013606+0.182008 test-rmse:46.473806+0.517652
## [448] train-rmse:46.005001+0.174053 test-rmse:46.466906+0.516801
## [449] train-rmse:45.991470+0.159449 test-rmse:46.453999+0.515204
## [450] train-rmse:45.977358+0.159128 test-rmse:46.441499+0.514711
## [451] train-rmse:45.958541+0.169133 test-rmse:46.419887+0.510390
## [452] train-rmse:45.940621+0.174791 test-rmse:46.402671+0.508032
## [453] train-rmse:45.917406+0.158332 test-rmse:46.383033+0.513019
## [454] train-rmse:45.908152+0.155719 test-rmse:46.374632+0.513436
## [455] train-rmse:45.888639+0.148349 test-rmse:46.358321+0.513723
## [456] train-rmse:45.879127+0.144152 test-rmse:46.349227+0.515903
## [457] train-rmse:45.860329+0.136885 test-rmse:46.332120+0.518566
## [458] train-rmse:45.828136+0.148722 test-rmse:46.302358+0.534762
## [459] train-rmse:45.814710+0.148782 test-rmse:46.292790+0.539094
## [460] train-rmse:45.810117+0.148493 test-rmse:46.288121+0.540062
## [461] train-rmse:45.796974+0.150638 test-rmse:46.277497+0.544760
## [462] train-rmse:45.783106+0.159720 test-rmse:46.262244+0.545237
## [463] train-rmse:45.767663+0.162771 test-rmse:46.246498+0.546676
## [464] train-rmse:45.749870+0.164724 test-rmse:46.229823+0.539200
## [465] train-rmse:45.732860+0.169222 test-rmse:46.215170+0.546710
## [466] train-rmse:45.720686+0.174869 test-rmse:46.200266+0.561913
## [467] train-rmse:45.704133+0.165141 test-rmse:46.187659+0.564464
## [468] train-rmse:45.694306+0.168043 test-rmse:46.178003+0.566280
## [469] train-rmse:45.686076+0.170003 test-rmse:46.169870+0.568332
## [470] train-rmse:45.671399+0.167197 test-rmse:46.157477+0.567083
## [471] train-rmse:45.636745+0.172462 test-rmse:46.126946+0.581677
## [472] train-rmse:45.621850+0.171584 test-rmse:46.110007+0.569973
## [473] train-rmse:45.593334+0.168309 test-rmse:46.080480+0.587886
## [474] train-rmse:45.584219+0.167995 test-rmse:46.070905+0.591608
## [475] train-rmse:45.571954+0.167452 test-rmse:46.061178+0.594360
## [476] train-rmse:45.550243+0.179039 test-rmse:46.041114+0.614364
## [477] train-rmse:45.537283+0.172300 test-rmse:46.027288+0.610681
## [478] train-rmse:45.513496+0.175295 test-rmse:46.003350+0.612629
## [479] train-rmse:45.490511+0.161270 test-rmse:45.977981+0.593276
## [480] train-rmse:45.475863+0.164596 test-rmse:45.963698+0.596320
## [481] train-rmse:45.456282+0.166386 test-rmse:45.946568+0.597153
## [482] train-rmse:45.446410+0.162893 test-rmse:45.936001+0.588190
## [483] train-rmse:45.435695+0.153688 test-rmse:45.926766+0.584428
## [484] train-rmse:45.430102+0.154594 test-rmse:45.921065+0.581302
## [485] train-rmse:45.414578+0.157382 test-rmse:45.906327+0.586862
## [486] train-rmse:45.396398+0.151450 test-rmse:45.888352+0.576171
## [487] train-rmse:45.385769+0.150543 test-rmse:45.877299+0.569684
## [488] train-rmse:45.382031+0.151747 test-rmse:45.875248+0.570096
## [489] train-rmse:45.369416+0.158593 test-rmse:45.863530+0.577850
## [490] train-rmse:45.366287+0.157468 test-rmse:45.861518+0.578351
## [491] train-rmse:45.350783+0.153858 test-rmse:45.847524+0.574549
## [492] train-rmse:45.334451+0.143255 test-rmse:45.830926+0.564263
## [493] train-rmse:45.319819+0.145327 test-rmse:45.816966+0.569166
## [494] train-rmse:45.298932+0.139875 test-rmse:45.796420+0.571776
## [495] train-rmse:45.286228+0.142201 test-rmse:45.782812+0.566775
## [496] train-rmse:45.270842+0.138614 test-rmse:45.768091+0.567127
## [497] train-rmse:45.260438+0.141958 test-rmse:45.758467+0.568403
## [498] train-rmse:45.237271+0.144656 test-rmse:45.739698+0.565418
## [499] train-rmse:45.217601+0.141640 test-rmse:45.722453+0.573798
## [500] train-rmse:45.205877+0.146292 test-rmse:45.713568+0.577943
best_nrounds <- cv_results$best_iteration
# Train the final model using the best number of rounds found
model_xgb <- xgb.train(
params = params,
data = train_dmatrix,
nrounds = best_nrounds
)
# Make predictions and evaluate the model
train_pred <- predict(model_xgb, train_dmatrix)
test_pred <- predict(model_xgb, test_dmatrix)
train_rmse <- sqrt(mean((train_labels - train_pred)^2))
test_rmse <- sqrt(mean((test_labels - test_pred)^2))
# Calculate R-squared for the training set
sst_train <- sum((train_labels - mean(train_labels)) ^ 2)
ssr_train <- sum((train_labels - train_pred) ^ 2)
r_squared_train <- 1 - (ssr_train / sst_train)
# Calculate R-squared for the test set
sst_test <- sum((test_labels - mean(test_labels)) ^ 2)
ssr_test <- sum((test_labels - test_pred) ^ 2)
r_squared_test <- 1 - (ssr_test / sst_test)
train_mape <- mean(abs((train_labels - train_pred) / train_labels)) * 100
test_mape <- mean(abs((test_labels - test_pred) / test_labels)) * 100
train_mae <- mean(abs(train_labels - train_pred))
test_mae <- mean(abs(test_labels - test_pred))
cat("Model Performance Metrics:\n",
"--------------------------\n",
"Training RMSE: ", train_rmse, "\n",
"Test RMSE: ", test_rmse, "\n",
"Training R-squared: ", r_squared_train, "\n",
"Test R-squared: ", r_squared_test, "\n",
"Training MAE: ", train_mae, "\n",
"Test MAE: ", test_mae, "\n",
"Training MAPE: ", train_mape, "%\n",
"Test MAPE: ", test_mape, "%\n", sep="")
## Model Performance Metrics:
## --------------------------
## Training RMSE: 44.82021
## Test RMSE: 44.16044
## Training R-squared: 0.5532223
## Test R-squared: 0.554944
## Training MAE: 28.18775
## Test MAE: 27.78375
## Training MAPE: 230.3977%
## Test MAPE: 227.1307%
# Calculate feature importance
importance_matrix2 <- xgb.importance(feature_names = colnames(train_features), model = model_xgb)
# View the feature importance scores
print(importance_matrix2)
## Feature Gain Cover Frequency
## <char> <num> <num> <num>
## 1: POP_SQMI 0.340931387 0.393861289 0.336921734
## 2: BEAUTIFUL.GREENER 0.116209505 0.036435002 0.062263602
## 3: SOCAL 0.077321094 0.033451523 0.035205121
## 4: WEEKS_SINCE_LAUNCH 0.072864529 0.144839771 0.138201920
## 5: X12SMALL.12ONE.CUP 0.057392785 0.083301611 0.054989817
## 6: HILL.MOISTURE.THRASHED.APPLE 0.057310016 0.016375429 0.058481234
## 7: NORTHERN 0.044424316 0.019071063 0.025894676
## 8: BEAUTIFUL.GREENER..PLUM 0.033698503 0.011734788 0.023276113
## 9: SINGLE.GROUP 0.031417764 0.019404404 0.018329939
## 10: MOUNTAIN 0.027541042 0.042845101 0.029386093
## 11: KANSAS 0.025484787 0.024550632 0.018620890
## 12: ARIZONA 0.021486319 0.014530069 0.025312773
## 13: X12SMALL.24ONE.CUP 0.015837330 0.006719971 0.026476578
## 14: RAINING..THRASHED.PLUM 0.013351363 0.005598469 0.016002328
## 15: ZIZZLES..PLUM 0.012424925 0.009293321 0.008437591
## 16: X12SMALL.18ONE.CUP 0.010776187 0.007169854 0.009601397
## 17: CALI_NEVADA 0.007987276 0.022850582 0.014547571
## 18: DESERT_SW 0.006845436 0.007852210 0.015420425
## 19: NEWMEXICO 0.006266702 0.012127745 0.009019494
## 20: COLORADO 0.006081127 0.033611868 0.019784696
## 21: PRAIRIE 0.005389787 0.017191027 0.015711376
## 22: NOCAL 0.004704506 0.025138140 0.018620890
## 23: X12SMALL.6ONE.CUP 0.002718068 0.004951345 0.013383765
## 24: X12SMALL.20ONE.CUP 0.001535244 0.007094786 0.006109980
## Feature Gain Cover Frequency
xgb.plot.importance(importance_matrix = importance_matrix2)
>
# Define vectors for each category
str(one_hot_plum)
## tibble [43,366 × 37] (S3: tbl_df/tbl/data.frame)
## $ DATE : Date[1:43366], format: "2022-06-18" "2022-04-30" ...
## $ UNIT_SALES : num [1:43366] 1 14 18 13 19 4 29 35 75 25 ...
## $ DOLLAR_SALES : num [1:43366] 4.62 86.86 89.73 65.6 72.93 ...
## $ POP_SQMI : num [1:43366] 1.2 1.2 1.2 1.2 1.2 ...
## $ MONTH : num [1:43366] 6 4 12 7 11 6 1 11 10 12 ...
## $ SEASON : chr [1:43366] "SUMMER" "SPRING" "WINTER" "SUMMER" ...
## $ PACKAGE2 : chr [1:43366] "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" "CUP 12 LIQUID SMALL X12 NA" ...
## $ ENERGY_DRINK : num [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ CALORIC_SEGMENT_TEXT : chr [1:43366] NA NA NA NA ...
## $ min_launch_date : Date[1:43366], format: "2021-09-04" "2021-09-04" ...
## $ WEEKS_SINCE_LAUNCH : num [1:43366] 41 34 14 47 12 40 20 9 6 13 ...
## $ NORTHERN : int [1:43366] 1 1 1 1 1 1 1 1 1 1 ...
## $ CALI_NEVADA : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ DESERT_SW : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ MOUNTAIN : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ SOCAL : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ PRAIRIE : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ ARIZONA : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ NEWMEXICO : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ NOCAL : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ COLORADO : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ KANSAS : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ HILL MOISTURE THRASHED APPLE: int [1:43366] 1 1 1 1 1 1 1 1 1 1 ...
## $ BEAUTIFUL GREENER : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ SINGLE GROUP : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ RAINING THRASHED PLUM : int [1:43366] 1 1 1 1 1 1 1 1 1 1 ...
## $ BEAUTIFUL GREENER PLUM : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ ZIZZLES PLUM : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ 12SMALL 12ONE CUP : int [1:43366] 1 1 1 1 1 1 1 1 1 1 ...
## $ 12SMALL 6ONE CUP : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ 12SMALL 24ONE CUP : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ 12SMALL 20ONE CUP : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ 12SMALL 18ONE CUP : int [1:43366] 0 0 0 0 0 0 0 0 0 0 ...
## $ SUMMER : int [1:43366] 1 0 0 1 0 1 0 0 0 0 ...
## $ SPRING : int [1:43366] 0 1 0 0 0 0 0 0 0 0 ...
## $ WINTER : int [1:43366] 0 0 1 0 0 0 1 0 0 1 ...
## $ FALL : int [1:43366] 0 0 0 0 1 0 0 1 1 0 ...
# List to store unique values for each variable
regions <- 1:11
brands <- 1:3
items <- 1:3
package_options <- 1:5
# Create data frame with all combinations of categories
combinations <- expand.grid(Region = regions, Brand = brands, Item = items, Package = package_options)
# Duplicate each combination 52 times to represent each week of the year
final_df_replicated <- combinations[rep(row.names(combinations), each = 52), ]
# Add a column with values from 1 to 52 for each combination
final_df_replicated$Week_of_Year <- rep(1:52, times = nrow(combinations))
# Duplicate each combination 52 times to represent each week of the year
final_df_replicated <- final_df_replicated[rep(row.names(final_df_replicated), each = 13), ]
# Add a column with values from 1 to 13 for each combination
final_df_replicated$Week_Since_Launch <- rep(1:13, times = nrow(combinations))
final_df_replicated$Region <- unique_values_list$REGION[final_df_replicated$Region]
final_df_replicated$Brand <- unique_values_list$BRAND[final_df_replicated$Brand]
final_df_replicated$Item <- unique_values_list$ITEM[final_df_replicated$Item]
final_df_replicated$Package <- unique_values_list$PACKAGE[final_df_replicated$Package]
# List to store unique values for each variable
new_unique_values_list <- list()
# Columns to get unique values for
new_columns_to_get_unique_values <- c("Region", "Brand", "Item", "Package")
# Get unique values for each variable and store in the list
for (col in new_columns_to_get_unique_values) {
new_unique_values_list[[col]] <- unique(final_df_replicated[[col]])
}
# Loop over unique regions and create new columns
for (Region in new_unique_values_list$Region) {
final_df_replicated[[Region]] <- as.integer(final_df_replicated$Region == Region)
}
# Loop over unique regions and create new columns
for (Brand in new_unique_values_list$Brand) {
final_df_replicated[[Brand]] <- as.integer(final_df_replicated$Brand == Brand)
}
# Loop over unique regions and create new columns
for (Item in new_unique_values_list$Item) {
final_df_replicated[[Item]] <- as.integer(final_df_replicated$Item == Item)
}
# Loop over unique regions and create new columns
for (Package in new_unique_values_list$Package) {
final_df_replicated[[Package]] <- as.integer(final_df_replicated$Package == Package)
}
#Create dummy_data and remove non one hot encoded data
dummy_data <- final_df_replicated %>%
select(-Region, -Brand, -Item, -Package)
#add a Unit sales column
dummy_data$UNIT_SALES <- NA
dummy_data$UNIT_SALES <- as.numeric(dummy_data$UNIT_SALES)
str(final_df_replicated)
## 'data.frame': 334620 obs. of 28 variables:
## $ Region : chr "NORTHERN" "NORTHERN" "NORTHERN" "NORTHERN" ...
## $ Brand : chr "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" "HILL MOISTURE THRASHED APPLE" ...
## $ Item : chr "RAINING THRASHED PLUM" "RAINING THRASHED PLUM" "RAINING THRASHED PLUM" "RAINING THRASHED PLUM" ...
## $ Package : chr "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" "12SMALL 12ONE CUP" ...
## $ Week_of_Year : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Week_Since_Launch : int 1 2 3 4 5 6 7 8 9 10 ...
## $ NORTHERN : int 1 1 1 1 1 1 1 1 1 1 ...
## $ CALI_NEVADA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DESERT_SW : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MOUNTAIN : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SOCAL : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PRAIRIE : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ARIZONA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NEWMEXICO : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NOCAL : int 0 0 0 0 0 0 0 0 0 0 ...
## $ COLORADO : int 0 0 0 0 0 0 0 0 0 0 ...
## $ KANSAS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HILL MOISTURE THRASHED APPLE: int 1 1 1 1 1 1 1 1 1 1 ...
## $ BEAUTIFUL GREENER : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SINGLE GROUP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ RAINING THRASHED PLUM : int 1 1 1 1 1 1 1 1 1 1 ...
## $ BEAUTIFUL GREENER PLUM : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ZIZZLES PLUM : int 0 0 0 0 0 0 0 0 0 0 ...
## $ 12SMALL 12ONE CUP : int 1 1 1 1 1 1 1 1 1 1 ...
## $ 12SMALL 6ONE CUP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ 12SMALL 24ONE CUP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ 12SMALL 20ONE CUP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ 12SMALL 18ONE CUP : int 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "out.attrs")=List of 2
## ..$ dim : Named int [1:4] 11 3 3 5
## .. ..- attr(*, "names")= chr [1:4] "Region" "Brand" "Item" "Package"
## ..$ dimnames:List of 4
## .. ..$ Region : chr [1:11] "Region= 1" "Region= 2" "Region= 3" "Region= 4" ...
## .. ..$ Brand : chr [1:3] "Brand=1" "Brand=2" "Brand=3"
## .. ..$ Item : chr [1:3] "Item=1" "Item=2" "Item=3"
## .. ..$ Package: chr [1:5] "Package=1" "Package=2" "Package=3" "Package=4" ...
str(dummy_data)
## 'data.frame': 334620 obs. of 25 variables:
## $ Week_of_Year : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Week_Since_Launch : int 1 2 3 4 5 6 7 8 9 10 ...
## $ NORTHERN : int 1 1 1 1 1 1 1 1 1 1 ...
## $ CALI_NEVADA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DESERT_SW : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MOUNTAIN : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SOCAL : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PRAIRIE : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ARIZONA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NEWMEXICO : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NOCAL : int 0 0 0 0 0 0 0 0 0 0 ...
## $ COLORADO : int 0 0 0 0 0 0 0 0 0 0 ...
## $ KANSAS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HILL MOISTURE THRASHED APPLE: int 1 1 1 1 1 1 1 1 1 1 ...
## $ BEAUTIFUL GREENER : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SINGLE GROUP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ RAINING THRASHED PLUM : int 1 1 1 1 1 1 1 1 1 1 ...
## $ BEAUTIFUL GREENER PLUM : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ZIZZLES PLUM : int 0 0 0 0 0 0 0 0 0 0 ...
## $ 12SMALL 12ONE CUP : int 1 1 1 1 1 1 1 1 1 1 ...
## $ 12SMALL 6ONE CUP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ 12SMALL 24ONE CUP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ 12SMALL 20ONE CUP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ 12SMALL 18ONE CUP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ UNIT_SALES : num NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "out.attrs")=List of 2
## ..$ dim : Named int [1:4] 11 3 3 5
## .. ..- attr(*, "names")= chr [1:4] "Region" "Brand" "Item" "Package"
## ..$ dimnames:List of 4
## .. ..$ Region : chr [1:11] "Region= 1" "Region= 2" "Region= 3" "Region= 4" ...
## .. ..$ Brand : chr [1:3] "Brand=1" "Brand=2" "Brand=3"
## .. ..$ Item : chr [1:3] "Item=1" "Item=2" "Item=3"
## .. ..$ Package: chr [1:5] "Package=1" "Package=2" "Package=3" "Package=4" ...
#rename columes to match original features
dummy_data <- dummy_data %>%
rename(
`BEAUTIFUL.GREENER` = `BEAUTIFUL GREENER`,
`X12SMALL.12ONE.CUP` = `12SMALL 12ONE CUP`, `HILL.MOISTURE.THRASHED.APPLE` = `HILL MOISTURE THRASHED APPLE`,
`BEAUTIFUL.GREENER..PLUM` = `BEAUTIFUL GREENER PLUM`,
`X12SMALL.24ONE.CUP` = `12SMALL 24ONE CUP`,
`X12SMALL.6ONE.CUP` = `12SMALL 6ONE CUP`,
`X12SMALL.20ONE.CUP` = `12SMALL 20ONE CUP`,
`RAINING..THRASHED.PLUM`=`RAINING THRASHED PLUM`,
`X12SMALL.18ONE.CUP`=`12SMALL 18ONE CUP`,
)
# Check for Matching Features
#Get the column names of Test and dummy_data
names_Test <- names(Test)
names_dummy_data <- names(dummy_data)
# Find the matching column names
matching_names <- intersect(names_Test, names_dummy_data)
# Find the non-matching column names
non_matching_names_Test <- setdiff(names_Test, matching_names)
non_matching_names_dummy_data <- setdiff(names_dummy_data, matching_names)
#Print the matching and non-matching column names
cat("Matching column names:", paste(matching_names, collapse = ", "), "\n")
## Matching column names: UNIT_SALES, NORTHERN, CALI_NEVADA, DESERT_SW, MOUNTAIN, SOCAL, PRAIRIE, ARIZONA, NEWMEXICO, NOCAL, COLORADO, KANSAS, HILL.MOISTURE.THRASHED.APPLE, BEAUTIFUL.GREENER, RAINING..THRASHED.PLUM, BEAUTIFUL.GREENER..PLUM, X12SMALL.12ONE.CUP, X12SMALL.6ONE.CUP, X12SMALL.24ONE.CUP, X12SMALL.20ONE.CUP, X12SMALL.18ONE.CUP
cat("Non-matching column names in Test:", paste(non_matching_names_Test, collapse = ", "), "\n")
## Non-matching column names in Test: POP_SQMI, PACKAGE2, ENERGY_DRINK, CALORIC_SEGMENT_TEXT, min_launch_date, WEEKS_SINCE_LAUNCH, SINGLE.GROUP, ZIZZLES..PLUM
cat("Non-matching column names in dummy_data:", paste(non_matching_names_dummy_data, collapse = ", "), "\n")
## Non-matching column names in dummy_data: Week_of_Year, Week_Since_Launch, SINGLE GROUP, ZIZZLES PLUM
# Adding the non-matching columns to dummy_data with default values
for (col in non_matching_names_Test) {
dummy_data[[col]] <- NA # You can change NA to any default value you prefer
}
# Get the column names of the Test dataframe
test_colnames <- colnames(Test)
# Reorder columns of dummy_data to match the order of columns in Test
dummy_data <- dummy_data %>%
select(all_of(test_colnames))
# Prepare features for XGBoost
dummy_features <- dummy_data[, -which(names(dummy_data) == "UNIT_SALES")]
# Convert data to DMatrix format
dummy_dmatrix<- xgb.DMatrix(data = as.matrix(dummy_features))
dummy_pred <- predict(model_xgb, dummy_dmatrix)
# Add the predictions to dummy_data
dummy_data$Predictions <- dummy_pred
# Convert predictions to integers
dummy_data$Predictions <- round(dummy_pred)
# Convert to integer data type
dummy_data$Predictions <- as.integer(dummy_data$Predictions)
# Apply post-processing to ensure non-negative predictions
dummy_pred <- predict(model_xgb, dummy_dmatrix)
# Set negative predictions to zero
dummy_pred <- pmax(dummy_pred, 0)
# Add the adjusted predictions to dummy_data
dummy_data$Predictions <- dummy_pred
summary(dummy_data$Predictions)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 6.803 22.251 35.879 154.412
ggplot(dummy_data, aes(x = Predictions)) +
geom_density(fill = "blue", alpha = 0.5) +
labs(title = "Density Plot of Predicted Values",
x = "Predicted Values",
y = "Density")
plum %>%
summarize(n = n(),
AVG_UNIT_SALES = mean(UNIT_SALES))
## # A tibble: 1 × 2
## n AVG_UNIT_SALES
## <int> <dbl>
## 1 43366 55.9
# Prediction Results
summary(dummy_data$Predictions)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 6.803 22.251 35.879 154.412
dummy_data %>%
summarize(n = n(),
AVG_UNIT_SALES = mean(Predictions))
## n AVG_UNIT_SALES
## 1 334620 22.25126
# #stop cluster
# doParallel::stopImplicitCluster()
#cleanup all obj
rm(list = ls())
Dummy data for unseen innovation products does not provide the most confident results.
#Reference Q information for this flava flav > Item Description: Diet Energy Moonlit Casava 2L Multi Jug Caloric Segment: Diet Market Category: Energy Manufacturer: Swire-CC Brand: Diet Moonlit Package Type: 2L Multi Jug Flavor: ‘Cassava’ Swire plans to release this product for 6 months. What will the forecasted demand be, in weeks, for this product?
df <- readRDS("swire_no_nas.rds") #inject the data and we will sub-sample
# Update CALORIC_SEGMENT values: 0 if 'DIET/LIGHT', otherwise 1
df$CALORIC_SEGMENT <- ifelse(df$CALORIC_SEGMENT == "DIET/LIGHT", 0, 1)
df$MARKET_KEY <- as.character(df$MARKET_KEY)
df <- df %>%
mutate(
MONTH = as.numeric(substr(DATE, 6, 7)), # Extract the month from YYYY-MM-DD format
SEASON = case_when(
MONTH %in% c(12, 01, 02) ~ "WINTER",
MONTH %in% c(03, 04, 05) ~ "SPRING",
MONTH %in% c(06, 07, 08) ~ "SUMMER",
MONTH %in% c(09, 10, 11) ~ "FALL",
TRUE ~ NA_character_ # This is just in case there are any undefined values
)
)
regions_joinme <- read.csv("states_summary.csv")
unique(regions_joinme$REGION)
## [1] "NORTHERN" "DESERT_SW" "PRAIRIE" "CALI_NEVADA" "MOUNTAIN"
## [6] "SOCAL" "ARIZONA" "NEWMEXICO" "NOCAL" "COLORADO"
## [11] "KANSAS"
# "NORTHERN" "DESERT_SW" "PRAIRIE" "CALI_NEVADA" "MOUNTAIN" "SOCAL" "ARIZONA" "NEWMEXICO" "NOCAL" "COLORADO" "KANSAS"
str(regions_joinme)
## 'data.frame': 200 obs. of 2 variables:
## $ MARKET_KEY: int 13 70 179 197 272 352 32 33 44 50 ...
## $ REGION : chr "NORTHERN" "NORTHERN" "DESERT_SW" "DESERT_SW" ...
# Perform a left join using the merge() function
df <- merge(df, regions_joinme[, c("MARKET_KEY", "REGION")], by = "MARKET_KEY", all.x = TRUE)
rm(regions_joinme)
str(df)
## 'data.frame': 24461424 obs. of 13 variables:
## $ MARKET_KEY : chr "1" "1" "1" "1" ...
## $ DATE : chr "2021-10-16" "2022-06-04" "2022-02-05" "2022-10-08" ...
## $ CALORIC_SEGMENT: num 0 0 1 0 0 1 0 0 1 0 ...
## $ CATEGORY : chr "ENERGY" "SSD" "SSD" "SSD" ...
## $ UNIT_SALES : num 434 28 42 1 26 161 6 5 68 90 ...
## $ DOLLAR_SALES : num 924.04 147.77 25.13 0.99 94.56 ...
## $ MANUFACTURER : chr "PONYS" "SWIRE-CC" "COCOS" "JOLLYS" ...
## $ BRAND : chr "MYTHICAL BEVERAGE ULTRA" "DIET PEPPY CF" "HANSENIZZLE'S ECO" "DIET PAPI" ...
## $ PACKAGE : chr "16SMALL MULTI CUP" "12SMALL 12ONE CUP" "12SMALL 6ONE CUP" "12SMALL 6ONE CUP" ...
## $ ITEM : chr "MYTHICAL BEVERAGE ULTRA SUNRISE ENERGY DRINK UNFLAVORED ZERO SUGAR CUP 16 LIQUID SMALL" "DIET PEPPY CAFFEINE FREE GENTLE DRINK RED PEPPER COLA DIET CUP 12 LIQUID SMALL X12" "HANSENIZZLE'S ECO GENTLE DRINK MANDARIN DURIAN CUP 12 LIQUID SMALL" "DIET PAPI GENTLE DRINK COLA DIET CUP 12 LIQUID SMALL" ...
## $ MONTH : num 10 6 2 10 7 9 9 6 10 5 ...
## $ SEASON : chr "FALL" "SUMMER" "WINTER" "FALL" ...
## $ REGION : chr "NORTHERN" "NORTHERN" "NORTHERN" "NORTHERN" ...
# Assuming df is your dataframe
set.seed(123) # Set a random seed for reproducibility
sampled_df <- df[sample(1:nrow(df), 2446143), ]
rm(df)
df <- sampled_df
rm(sampled_df)
#skim(df)
summary(df)
## MARKET_KEY DATE CALORIC_SEGMENT CATEGORY
## Length:2446143 Length:2446143 Min. :0.0000 Length:2446143
## Class :character Class :character 1st Qu.:0.0000 Class :character
## Mode :character Mode :character Median :1.0000 Mode :character
## Mean :0.5024
## 3rd Qu.:1.0000
## Max. :1.0000
## UNIT_SALES DOLLAR_SALES MANUFACTURER BRAND
## Min. : 0.04 Min. : 0.0 Length:2446143 Length:2446143
## 1st Qu.: 11.00 1st Qu.: 36.5 Class :character Class :character
## Median : 40.00 Median : 134.9 Mode :character Mode :character
## Mean : 173.87 Mean : 590.0
## 3rd Qu.: 126.00 3rd Qu.: 427.3
## Max. :92448.00 Max. :392062.7
## PACKAGE ITEM MONTH SEASON
## Length:2446143 Length:2446143 Min. : 1.000 Length:2446143
## Class :character Class :character 1st Qu.: 3.000 Class :character
## Mode :character Mode :character Median : 6.000 Mode :character
## Mean : 6.287
## 3rd Qu.: 9.000
## Max. :12.000
## REGION
## Length:2446143
## Class :character
## Mode :character
##
##
##
# Perform a linear regression with UNIT_SALES as the dependent variable
# and PRICE (or your chosen variable) as the independent variable
linear_model <- lm(DOLLAR_SALES ~ UNIT_SALES, data = df)
# Print the summary of the linear model to see the results
summary(linear_model)
##
## Call:
## lm(formula = DOLLAR_SALES ~ UNIT_SALES, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -107746 -112 -60 3 239824
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.563703 1.050071 57.68 <2e-16 ***
## UNIT_SALES 3.045165 0.001204 2528.31 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1609 on 2446141 degrees of freedom
## Multiple R-squared: 0.7232, Adjusted R-squared: 0.7232
## F-statistic: 6.392e+06 on 1 and 2446141 DF, p-value: < 2.2e-16
# Create a scatter plot with the regression line, colored by MANUFACTURER
ggplot(df, aes(x = UNIT_SALES, y = DOLLAR_SALES, color = MANUFACTURER)) +
geom_point(alpha = 0.5) + # Adjust alpha to avoid overplotting, if necessary
geom_smooth(method = "lm", color = "black", se = FALSE) + # Add linear regression line without confidence band for clarity
labs(title = "Linear Model of UNIT_SALES vs. DOLLAR_SALES by MANUFACTURER",
x = "UNIT SALES",
y = "DOLLAR SALES") +
theme_minimal() +
theme(legend.position = "bottom") # Adjust legend position if needed
## `geom_smooth()` using formula = 'y ~ x'
# create a table of total values by brand
brand_summary <- df %>%
group_by(BRAND) %>%
summarise(
total_units_sold = sum(UNIT_SALES),
total_revenue = sum(DOLLAR_SALES),
avg_price = total_revenue / total_units_sold,
total_days_sold = n() # Count the number of rows for each brand
) %>%
arrange(desc(total_revenue)) %>% # Order by revenue in descending order
mutate(rank = row_number())
summary(brand_summary)
## BRAND total_units_sold total_revenue avg_price
## Length:286 Min. : 1 Min. : 1 Min. : 0.1658
## Class :character 1st Qu.: 2575 1st Qu.: 8778 1st Qu.: 2.0501
## Mode :character Median : 98109 Median : 296741 Median : 2.9876
## Mean : 1487092 Mean : 5046437 Mean : 3.2334
## 3rd Qu.: 669287 3rd Qu.: 2159509 3rd Qu.: 3.7412
## Max. :41531681 Max. :164499278 Max. :42.9411
## total_days_sold rank
## Min. : 1.0 Min. : 1.00
## 1st Qu.: 134.8 1st Qu.: 72.25
## Median : 2116.0 Median :143.50
## Mean : 8553.0 Mean :143.50
## 3rd Qu.: 8275.0 3rd Qu.:214.75
## Max. :123829.0 Max. :286.00
print(brand_summary[brand_summary$BRAND == "DIET MOONLIT", ])
## # A tibble: 1 × 6
## BRAND total_units_sold total_revenue avg_price total_days_sold rank
## <chr> <dbl> <dbl> <dbl> <int> <int>
## 1 DIET MOONLIT 747385. 2502014. 3.35 7587 70
Diet Moonlit is a rising star ranking 69 out of 288 brands in terms of total revenue, with an average price of $3.50 slightly above the overall mean of $3.27.
# Filter the dataframe for only 'DIET SMASH'
filtered_df <- df %>%
filter(BRAND == "DIET MOONLIT")
# Create the plot
ggplot(filtered_df, aes(x = UNIT_SALES, y = DOLLAR_SALES)) +
geom_point(color = "red", alpha = 1) + # Bright red points with full opacity
geom_smooth(method = "lm", color = "black", se = FALSE) + # Add linear regression line without confidence band
labs(title = "Linear Model of UNIT_SALES vs. DOLLAR_SALES for DIET SMASH",
x = "UNIT SALES",
y = "DOLLAR SALES") +
theme_minimal() +
theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'
DIET MOONLIT has a tight cluster below 1,000 unit sales and $2,500 revenue, but there are some remarkable high fliers nearing $20,000 and just over 3000 units.
filtered_df %>%
mutate(DATE = as.Date(DATE)) %>%
mutate(WEEK = as.integer(format(DATE, "%U"))) %>%
group_by(WEEK) %>%
summarise(total_sales = sum(UNIT_SALES)) %>%
ggplot(aes(x = WEEK, y = total_sales)) +
geom_line(color = "black") + # Blue line connecting points
labs(title = "Total Sales by Week of the Year",
x = "Week of the Year",
y = "Total Unit Sales") +
theme_minimal()
> DIET MOONLIT shows many peaks and valleys in sales by week.
library(zoo)
# Calculate total sales for each group of 211 consecutive weeks (6 months)
sales_by_group <- filtered_df %>%
mutate(DATE = as.Date(DATE)) %>%
mutate(WEEK = as.integer(format(DATE, "%U"))) %>%
group_by(WEEK) %>%
summarise(total_sales = sum(UNIT_SALES)) %>%
mutate(sales_in_group = rollsum(total_sales, 21, align = "left", fill = NA)) %>%
mutate(week_label = paste0("Week ", WEEK + 1, " to Week ", WEEK + 21)) %>%
arrange(WEEK) %>% # Order by WEEK
filter(!is.na(sales_in_group)) # Remove rows with sales_in_group = NA
# Plot the bar chart
sales_by_group$week_label <- factor(sales_by_group$week_label, levels = sales_by_group$week_label[order(sales_by_group$WEEK)])
ggplot(sales_by_group, aes(x = factor(week_label), y = sales_in_group)) +
geom_bar(stat = "identity", fill = "black") +
labs(title = "Total Sales for Each 6-month Grouping",
x = "Weeks (Starting from Week 1)",
y = "Total Sales") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
> DIET MOONLIT has it’s best 6 month runs week 7 - 27
historically.
#find the best 21 weeks for Casava sales
# Calculate total sales for each group of 21 consecutive weeks
sales_by_casava <- df %>%
filter(str_detect(ITEM, "CASAVA")) %>%
mutate(DATE = as.Date(DATE)) %>%
mutate(WEEK = as.integer(format(DATE, "%U"))) %>%
group_by(WEEK) %>%
summarise(total_sales = sum(UNIT_SALES)) %>%
mutate(sales_in_group = rollsum(total_sales, 21, align = "left", fill = NA)) %>%
mutate(week_label = paste0("Week ", WEEK + 1, " to Week ", WEEK + 21)) %>%
arrange(WEEK) %>% # Order by WEEK
filter(!is.na(sales_in_group)) # Remove rows with sales_in_group = NA
# Plot the bar chart
sales_by_casava$week_label <- factor(sales_by_casava$week_label, levels = sales_by_casava$week_label[order(sales_by_casava$WEEK)])
ggplot(sales_by_casava, aes(x = factor(week_label), y = sales_in_group)) +
geom_bar(stat = "identity", fill = "black") +
labs(title = "Total Sales for Each 21-Week Grouping",
x = "Weeks (Starting from Week 1)",
y = "Total Sales") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
> Casava sales are best in the 21 weeks from week 14 to 34.
#find the best 21 weeks for casava, energy, diet, s
# Calculate total sales for each group of 21 consecutive weeks
sales_by_innovation <- df %>%
filter(CATEGORY == "ENERGY",
str_detect(ITEM, "CASAVA")) %>%
mutate(DATE = as.Date(DATE)) %>%
mutate(WEEK = as.integer(format(DATE, "%U"))) %>%
group_by(WEEK) %>%
summarise(total_sales = sum(UNIT_SALES)) %>%
mutate(sales_in_group = rollsum(total_sales, 13, align = "left", fill = NA)) %>%
mutate(week_label = paste0("Week ", WEEK + 1, " to Week ", WEEK + 13)) %>%
arrange(WEEK) %>% # Order by WEEK
filter(!is.na(sales_in_group)) # Remove rows with sales_in_group = NA
# Plot the bar chart
sales_by_innovation$week_label <- factor(sales_by_innovation$week_label, levels = sales_by_innovation$week_label[order(sales_by_innovation$WEEK)])
ggplot(sales_by_innovation, aes(x = factor(week_label), y = sales_in_group)) +
geom_bar(stat = "identity", fill = "black") +
labs(title = "Total Sales for Each 13-Week Grouping",
x = "Weeks (Starting from Week 1)",
y = "Total Sales") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
#create innovation based on Energy, Casava
innovation<- df %>%
filter(CATEGORY == "ENERGY",
str_detect(ITEM, "CASAVA"))
#unique PACKAGE string from innovation
print(unique(innovation$PACKAGE))
## [1] "16SMALL MULTI CUP" "16SMALL 24ONE CUP"
library(dplyr)
library(lubridate)
innovation <- innovation %>%
mutate(
MONTH = month(ymd(DATE)), # Extract month using lubridate's ymd function
MONTH = as.factor(MONTH) # Convert the extracted month into a factor
)
str(innovation)
## 'data.frame': 5045 obs. of 13 variables:
## $ MARKET_KEY : chr "59" "133" "965" "303" ...
## $ DATE : chr "2022-07-23" "2023-01-28" "2022-02-12" "2022-10-22" ...
## $ CALORIC_SEGMENT: num 1 1 1 1 1 1 1 1 1 1 ...
## $ CATEGORY : chr "ENERGY" "ENERGY" "ENERGY" "ENERGY" ...
## $ UNIT_SALES : num 1 55 186 123 159 3 54 17 102 58 ...
## $ DOLLAR_SALES : num 2.49 145.36 417.21 210.09 407.14 ...
## $ MANUFACTURER : chr "JOLLYS" "PONYS" "PONYS" "JOLLYS" ...
## $ BRAND : chr "SUPER-DUPER RECOVERY" "MYTHICAL BEVERAGE" "MYTHICAL BEVERAGE" "SUPER-DUPER RECOVERY" ...
## $ PACKAGE : chr "16SMALL MULTI CUP" "16SMALL MULTI CUP" "16SMALL MULTI CUP" "16SMALL MULTI CUP" ...
## $ ITEM : chr "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK CUP 16 LIQUID SMALL" "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA CUP 16 LIQUID SMALL" "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA CUP 16 LIQUID SMALL" "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK CUP 16 LIQUID SMALL" ...
## $ MONTH : Factor w/ 12 levels "1","2","3","4",..: 7 1 2 10 11 8 3 3 6 6 ...
## $ SEASON : chr "SUMMER" "WINTER" "WINTER" "FALL" ...
## $ REGION : chr "CALI_NEVADA" "MOUNTAIN" "COLORADO" "MOUNTAIN" ...
print(unique(innovation$ITEM))
## [1] "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK CUP 16 LIQUID SMALL"
## [2] "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA CUP 16 LIQUID SMALL"
## [3] "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA CUP 16 LIQUID SMALL"
## [4] "SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS CUP 16 LIQUID SMALL"
## [5] "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA CUP 16 LIQUID SMALL X24"
## [6] "SUPER-DUPER PURE ZERO ENERGY DRINK CASAVA YELLOW SUGAR FREE CUP 16 LIQUID SMALL"
# Count the number of unique PACKAGE column of our sample
table(innovation$PACKAGE)
##
## 16SMALL 24ONE CUP 16SMALL MULTI CUP
## 2 5043
# Creating an 'innovation' data frame
#factor Region
model <- lm(DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + PACKAGE + REGION, data = innovation)
summary(model)
##
## Call:
## lm(formula = DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + PACKAGE +
## REGION, data = innovation)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1001.45 -11.82 1.66 16.23 1053.09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.760063 70.086746 0.325 0.74539
## UNIT_SALES 2.294841 0.004102 559.512 < 2e-16 ***
## CALORIC_SEGMENT -5.198822 57.246299 -0.091 0.92764
## PACKAGE16SMALL MULTI CUP -22.051746 40.400212 -0.546 0.58521
## REGIONCALI_NEVADA 2.091218 4.713024 0.444 0.65727
## REGIONCOLORADO 4.879770 2.979656 1.638 0.10155
## REGIONDESERT_SW -1.086808 3.561049 -0.305 0.76023
## REGIONKANSAS -5.588681 7.329994 -0.762 0.44583
## REGIONMOUNTAIN -22.267932 3.006636 -7.406 1.52e-13 ***
## REGIONNEWMEXICO -0.872840 4.491293 -0.194 0.84592
## REGIONNOCAL 8.468503 4.357528 1.943 0.05202 .
## REGIONNORTHERN -10.153914 2.297398 -4.420 1.01e-05 ***
## REGIONPRAIRIE -7.516506 5.472385 -1.374 0.16965
## REGIONSOCAL 9.365306 3.471712 2.698 0.00701 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57.1 on 5031 degrees of freedom
## Multiple R-squared: 0.9877, Adjusted R-squared: 0.9876
## F-statistic: 3.099e+04 on 13 and 5031 DF, p-value: < 2.2e-16
Cassava and Energy together do quite well (not possible to also add in DIET, but we will expect that folks that like Cassava Regular Energy will also like DIET). R2 of 0.98. Summer is statistically signficant, but negatively correlated with sales. SOCAL and NOCAL are significant in the positive direction.
#More exploration
library(dplyr)
small_group <- df %>%
filter(UNIT_SALES < 7000, DOLLAR_SALES < 20000)
skim(small_group)
| Name | small_group |
| Number of rows | 2440779 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| MARKET_KEY | 0 | 1 | 1 | 4 | 0 | 200 | 0 |
| DATE | 0 | 1 | 10 | 10 | 0 | 152 | 0 |
| CATEGORY | 0 | 1 | 3 | 18 | 0 | 5 | 0 |
| MANUFACTURER | 0 | 1 | 5 | 8 | 0 | 8 | 0 |
| BRAND | 0 | 1 | 4 | 56 | 0 | 286 | 0 |
| PACKAGE | 0 | 1 | 11 | 26 | 0 | 95 | 0 |
| ITEM | 0 | 1 | 26 | 142 | 0 | 2984 | 0 |
| SEASON | 0 | 1 | 4 | 6 | 0 | 4 | 0 |
| REGION | 0 | 1 | 5 | 11 | 0 | 11 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| CALORIC_SEGMENT | 0 | 1 | 0.50 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1.0 | ▇▁▁▁▇ |
| UNIT_SALES | 0 | 1 | 149.97 | 382.85 | 0.04 | 11.00 | 40.00 | 125.00 | 6998.0 | ▇▁▁▁▁ |
| DOLLAR_SALES | 0 | 1 | 496.94 | 1222.46 | 0.01 | 36.36 | 134.34 | 423.52 | 19999.4 | ▇▁▁▁▁ |
| MONTH | 0 | 1 | 6.29 | 3.43 | 1.00 | 3.00 | 6.00 | 9.00 | 12.0 | ▇▆▆▅▇ |
skim(df %>% filter(BRAND == "DIET MOONLIT"))
| Name | df %>% filter(BRAND == “D… |
| Number of rows | 7587 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| MARKET_KEY | 0 | 1 | 1 | 4 | 0 | 200 | 0 |
| DATE | 0 | 1 | 10 | 10 | 0 | 147 | 0 |
| CATEGORY | 0 | 1 | 3 | 3 | 0 | 1 | 0 |
| MANUFACTURER | 0 | 1 | 8 | 8 | 0 | 1 | 0 |
| BRAND | 0 | 1 | 12 | 12 | 0 | 1 | 0 |
| PACKAGE | 0 | 1 | 12 | 17 | 0 | 5 | 0 |
| ITEM | 0 | 1 | 50 | 63 | 0 | 5 | 0 |
| SEASON | 0 | 1 | 4 | 6 | 0 | 4 | 0 |
| REGION | 0 | 1 | 5 | 11 | 0 | 11 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| CALORIC_SEGMENT | 0 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
| UNIT_SALES | 0 | 1 | 98.51 | 290.67 | 1.00 | 23.00 | 51.00 | 92.00 | 4720.00 | ▇▁▁▁▁ |
| DOLLAR_SALES | 0 | 1 | 329.78 | 921.76 | 0.75 | 56.74 | 133.91 | 321.35 | 21447.56 | ▇▁▁▁▁ |
| MONTH | 0 | 1 | 6.38 | 3.41 | 1.00 | 4.00 | 6.00 | 9.00 | 12.00 | ▇▆▆▅▇ |
Our small df has a higher mean of unit sales and dollar sales of 149 and $496. as compared to the full df of DIET MOONLIT of 98 and $344.
# Create a scatter plot with the regression line, colored by MANUFACTURER
ggplot(small_group, aes(x = UNIT_SALES, y = DOLLAR_SALES, color = MANUFACTURER)) +
geom_point(alpha = 0.5) + # Adjust alpha to avoid overplotting, if necessary
geom_smooth(method = "lm", color = "black", se = FALSE) + # Add linear regression line without confidence band for clarity
labs(title = "Linear Model of UNIT_SALES vs. DOLLAR_SALES by MANUFACTURER",
x = "UNTI SALES",
y = "DOLLAR SALES") +
theme_minimal() +
theme(legend.position = "bottom") # Adjust legend position if needed
## `geom_smooth()` using formula = 'y ~ x'
Behold the realm of DIET MOONLIT. Certain items sell much better, or wosrse with consideration of slop of dollars to units sold. While most of its realm is in the lower left hand portion, other brands have sales through both its unit and dollar sales vectors.
#Make the small casava df > Investigating drinks with casava as a flavor in the Item description.
# Create a new data frame with only the rows where the ITEM column contains the word 'casava'
casava_small <- df[grep("casava", df$ITEM, ignore.case = TRUE), ]
skim(casava_small)
| Name | casava_small |
| Number of rows | 42420 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| MARKET_KEY | 0 | 1 | 1 | 4 | 0 | 200 | 0 |
| DATE | 0 | 1 | 10 | 10 | 0 | 152 | 0 |
| CATEGORY | 0 | 1 | 3 | 18 | 0 | 4 | 0 |
| MANUFACTURER | 0 | 1 | 5 | 8 | 0 | 5 | 0 |
| BRAND | 0 | 1 | 5 | 26 | 0 | 28 | 0 |
| PACKAGE | 0 | 1 | 12 | 26 | 0 | 25 | 0 |
| ITEM | 0 | 1 | 46 | 112 | 0 | 83 | 0 |
| SEASON | 0 | 1 | 4 | 6 | 0 | 4 | 0 |
| REGION | 0 | 1 | 5 | 11 | 0 | 11 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| CALORIC_SEGMENT | 0 | 1 | 0.61 | 0.49 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▅▁▁▁▇ |
| UNIT_SALES | 0 | 1 | 73.35 | 183.67 | 1.00 | 9.00 | 30.00 | 79.00 | 6678.00 | ▇▁▁▁▁ |
| DOLLAR_SALES | 0 | 1 | 188.68 | 405.78 | 0.35 | 23.32 | 75.85 | 213.08 | 12886.48 | ▇▁▁▁▁ |
| MONTH | 0 | 1 | 6.47 | 3.35 | 1.00 | 4.00 | 7.00 | 9.00 | 12.00 | ▇▆▆▅▇ |
Casava has a much lower unit sales and dollar sales at 71 and $184 than Diet Moonlight at 98 and $344.
# casava small is dataframe
#factor Region
casava_small$REGION <- as.factor(casava_small$REGION)
model <- lm(DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + PACKAGE + CATEGORY + SEASON, data = casava_small)
summary(model)
##
## Call:
## lm(formula = DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + PACKAGE +
## CATEGORY + SEASON, data = casava_small)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3376.3 -31.4 -3.6 26.7 7209.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.764e+01 3.544e+01 2.755 0.005872 **
## UNIT_SALES 2.006e+00 4.145e-03 483.899 < 2e-16 ***
## CALORIC_SEGMENT 1.222e+01 3.415e+01 0.358 0.720615
## PACKAGE.5L 6ONE JUG -4.559e+00 7.168e+00 -0.636 0.524707
## PACKAGE.5L MULTI JUG -3.817e+00 5.248e+01 -0.073 0.942017
## PACKAGE12SMALL 12ONE CUP 2.202e+02 8.707e+00 25.291 < 2e-16 ***
## PACKAGE12SMALL 24ONE PLASTICS JUG -2.275e+01 1.090e+02 -0.209 0.834658
## PACKAGE12SMALL 4ONE PET -3.135e+01 1.539e+02 -0.204 0.838570
## PACKAGE12SMALL 6ONE CUP -4.787e+01 3.128e+01 -1.530 0.125948
## PACKAGE12SMALL 6ONE MEDIUM CUP -1.622e+01 2.084e+01 -0.778 0.436308
## PACKAGE12SMALL 8ONE BUMPY CUP 7.122e-01 1.845e+01 0.039 0.969204
## PACKAGE12SMALL 8ONE CUP 1.194e+02 3.092e+01 3.860 0.000114 ***
## PACKAGE12SMALL MLT MEDIUM CUP -3.776e+01 4.514e+01 -0.836 0.402961
## PACKAGE12SMALL MLT PLASTICS JUG -4.270e+01 9.496e+00 -4.497 6.91e-06 ***
## PACKAGE12SMALL MULTI CUP 3.034e+01 5.027e+01 0.604 0.546130
## PACKAGE16SMALL 24ONE CUP -1.069e+02 1.090e+02 -0.981 0.326806
## PACKAGE16SMALL MLT SHADYES JUG -3.776e+01 1.539e+02 -0.245 0.806142
## PACKAGE16SMALL MULTI CUP -9.307e+01 8.855e+00 -10.511 < 2e-16 ***
## PACKAGE18SMALL MULTI JUG -4.791e+01 5.392e+00 -8.886 < 2e-16 ***
## PACKAGE1L MULTI JUG -3.605e+01 1.345e+01 -2.680 0.007370 **
## PACKAGE20SMALL MULTI JUG -2.529e+01 8.642e+00 -2.926 0.003435 **
## PACKAGE24 - 25SMALL MULTI JUG -5.608e+01 6.753e+00 -8.303 < 2e-16 ***
## PACKAGE24SMALL MLT SHADYES JUG -3.743e+01 1.996e+01 -1.875 0.060791 .
## PACKAGE2L MULTI JUG -7.211e+01 8.673e+00 -8.314 < 2e-16 ***
## PACKAGE3L MULTI JUG -3.096e+01 5.497e+01 -0.563 0.573240
## PACKAGE7.5SMALL 6ONE CUP -3.071e+01 4.344e+01 -0.707 0.479554
## PACKAGEALL OTHER ONES 2.381e+01 3.090e+01 0.771 0.440973
## CATEGORYING ENHANCED WATER -5.774e+01 3.507e+01 -1.646 0.099705 .
## CATEGORYSPARKLING WATER -1.305e+02 1.785e+01 -7.311 2.70e-13 ***
## CATEGORYSSD -7.457e+01 3.518e+00 -21.200 < 2e-16 ***
## SEASONSPRING 1.251e+00 2.133e+00 0.586 0.557672
## SEASONSUMMER 6.951e+00 2.104e+00 3.303 0.000958 ***
## SEASONWINTER -3.287e+00 2.186e+00 -1.504 0.132585
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 153.6 on 42387 degrees of freedom
## Multiple R-squared: 0.8568, Adjusted R-squared: 0.8566
## F-statistic: 7922 on 32 and 42387 DF, p-value: < 2.2e-16
Our Casava small has a lower R2 of 0.85, but also contains much more data with nearly 42K observations compared to our innovation df at about 5 observations. There are many signficant features, but nothing that swings the needle in huge ways.
Reworking the subset casava for more feature engineering.
casava_small <- casava_small %>%
mutate(
PACKAGE2 = str_extract(ITEM, "(CUP|JUG).*"), # Extracts the part from CUP or JUG to the end.
ITEM = str_replace(ITEM, "(CUP|JUG).*", "") # Replaces the CUP/JUG and everything after it with empty string in ITEM.
)
#casava_small
casava_small <- casava_small %>%
mutate(
TEMP = str_extract(ITEM, "\\d+\\.?\\d*.*"), # Extracts the part from the first number to the end.
PACKAGE2 = if_else(is.na(PACKAGE2), TEMP, paste(PACKAGE2, TEMP)), # Combines existing PACKAGE2 with new extraction if needed.
ITEM = str_replace(ITEM, "\\d+\\.?\\d*.*", ""), # Removes the numeric part and everything after it from ITEM.
TEMP = NULL # Removes the temporary column.
)
#casava_small
na_rows <- casava_small %>%
filter(is.na(PACKAGE2))
#na_rows
#the above steps excised all packaging out of ITEM column
casava_small <- casava_small %>%
mutate(
GENTLE_DRINK = if_else(str_detect(ITEM, "GENTLE DRINK"), 1, 0), # Assigns 1 if "GENTLE DRINK" exists, otherwise 0.
ITEM = str_replace(ITEM, "GENTLE DRINK", "") # Removes "GENTLE DRINK" from ITEM.
)
#casava_small
casava_small <- casava_small %>%
mutate(
ENERGY_DRINK = if_else(str_detect(ITEM, "ENERGY DRINK"), 1, 0), # Assigns 1 if "ENERGY DRINK" exists, otherwise 0.
ITEM = str_replace(ITEM, "ENERGY DRINK", "") # Removes "ENERGY DRINK" from ITEM.
)
#casava_small
library(dplyr)
library(stringr)
# Define the pattern as a regular expression
pattern <- "ZERO CALORIES|ZERO CALORIE|ZERO SUGAR|SUGAR FREE|NO CALORIES"
casava_small <- casava_small %>%
mutate(
CALORIC_SEGMENT_TEXT = str_extract(ITEM, pattern), # Extracts matching text based on the pattern.
ITEM = str_replace_all(ITEM, pattern, "") # Removes extracted text from ITEM.
)
#casava_small
library(dplyr)
library(stringr)
casava_small <- casava_small %>%
mutate(
CALORIC_SEGMENT_TEXT = if_else(str_detect(ITEM, "\\bDIET\\b"),
if_else(is.na(CALORIC_SEGMENT_TEXT), "DIET", paste(CALORIC_SEGMENT_TEXT, "DIET", sep=", ")),
CALORIC_SEGMENT_TEXT)
)
#casava_small
# Function to remove the second instance of any repeating word
remove_second_instance <- function(item) {
words <- unlist(str_split(item, "\\s+")) # Split item into words
unique_words <- unique(words) # Get unique words to check for repeats
for (word in unique_words) {
word_indices <- which(words == word) # Find all indices of the current word
if (length(word_indices) > 1) { # If there is more than one occurrence
words[word_indices[2]] <- "" # Remove the second occurrence
}
}
return(paste(words, collapse = " ")) # Reconstruct sentence without the second instance
}
# Apply the function to the 'ITEM' column
casava_small <- casava_small %>%
mutate(ITEM = sapply(ITEM, remove_second_instance))
# Remove specific columns
casava_small <- select(casava_small, -PACKAGE2, -GENTLE_DRINK, -ENERGY_DRINK, -CALORIC_SEGMENT_TEXT)
head(casava_small)
## MARKET_KEY DATE CALORIC_SEGMENT CATEGORY UNIT_SALES
## 18128439 784 2022-12-24 0 SPARKLING WATER 20
## 13183569 59 2022-07-23 1 ENERGY 1
## 9036653 441 2022-05-21 0 SPARKLING WATER 95
## 21010102 893 2021-05-08 1 SSD 9
## 23033055 965 2021-07-31 1 SSD 32
## 20054930 87 2021-05-15 0 SPARKLING WATER 20
## DOLLAR_SALES MANUFACTURER BRAND PACKAGE
## 18128439 89.53 JOLLYS BUBBLE JOY 12SMALL 8ONE CUP
## 13183569 2.49 JOLLYS SUPER-DUPER RECOVERY 16SMALL MULTI CUP
## 9036653 345.41 JOLLYS BUBBLE JOY 12SMALL 8ONE CUP
## 21010102 9.00 JOLLYS HILL MOISTURE 16SMALL MULTI CUP
## 23033055 107.31 COCOS FANTASMIC .5L 6ONE JUG
## 20054930 31.70 SWIRE-CC GREETINGLE BUBBLES ALL OTHER ONES
## ITEM
## 18128439 BUBBLE JOY SPARKLING WATER JACK CASAVA
## 13183569 SUPER-DUPER RECOVERY CASAVA JACK
## 9036653 BUBBLE JOY SPARKLING WATER JACK CASAVA
## 21010102 RAINING MAUI BURST CASAVA
## 23033055 FANTASMIC CASAVA
## 20054930 GREETINGLE BUBBLES HEALTH BEVERAGE PERU CASAVA NO ARTIFICIAL SWEETENERS
## MONTH SEASON REGION
## 18128439 12 WINTER NOCAL
## 13183569 7 SUMMER CALI_NEVADA
## 9036653 5 SPRING MOUNTAIN
## 21010102 5 SPRING ARIZONA
## 23033055 7 SUMMER COLORADO
## 20054930 5 SPRING MOUNTAIN
DIET MOONLIT has pretty decent sales at 69th place in total revenue. Casava is not the sexiest flavor in town, but with our innovation dataframe the R2 is quite high (although it is based on regular and no specific package type). There are some weeks that look great for 6 month predictions, it’s just a matter of deciding which ones to use.
Data Prep and XgBoost Model for best weeks()
df <- read_csv("swire_no_nas_w_pop.csv") #inject the data and we will sub-sample
## Rows: 24461424 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): CALORIC_SEGMENT, CATEGORY, MANUFACTURER, BRAND, PACKAGE, ITEM
## dbl (4): MARKET_KEY, UNIT_SALES, DOLLAR_SALES, POP_SQMI
## date (1): DATE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# #print unique package where BRAND is DIET MOONLIT
# unique(df$PACKAGE[df$BRAND == "DIET MOONLIT"])
#
# #print count of CATEGORY = ENERGY with 2L in PACKAGE or ITEM
# table(df$CATEGORY[df$PACKAGE == "2L MULTI JUG" | df$ITEM == "2L MULTI JUG" | df$PACKAGE == "2L MULTI JUG" | df$ITEM == "2L MULTI JUG"])
#sales of BRAND=="MOONLIT"at Max date
df %>%
filter(BRAND == "MOONLIT") %>%
arrange(desc(DATE)) %>%
summarise(UNIT_SALES = sum(UNIT_SALES)) %>%
head(1)
## # A tibble: 1 × 1
## UNIT_SALES
## <dbl>
## 1 26837843.
df %>%
filter(BRAND == "MOONLIT") %>%
arrange((DATE)) %>%
group_by(DATE) %>%
summarize(sum(UNIT_SALES)) %>%
head(1)
## # A tibble: 1 × 2
## DATE `sum(UNIT_SALES)`
## <date> <dbl>
## 1 2020-12-05 172170
#Group by ITEM with DATE Before 2021-01-01, drop those ITEM rows
df_long_running <- df %>%
group_by(ITEM) %>%
filter(DATE <= "2021-01-01")
#remove all rows in casava_long_running from casava
df <- df %>%
anti_join(df_long_running)
## Joining with `by = join_by(MARKET_KEY, DATE, CALORIC_SEGMENT, CATEGORY,
## UNIT_SALES, DOLLAR_SALES, MANUFACTURER, BRAND, PACKAGE, ITEM, POP_SQMI)`
#Group by ITEM rows with less than 20 weeks of data
df_small <- df %>%
group_by(ITEM) %>%
filter(n() <= 20)
#remove all rows in casava_long_running from casava
df <- df %>%
anti_join(df_small)
## Joining with `by = join_by(MARKET_KEY, DATE, CALORIC_SEGMENT, CATEGORY,
## UNIT_SALES, DOLLAR_SALES, MANUFACTURER, BRAND, PACKAGE, ITEM, POP_SQMI)`
#Drop rows after May 21st 2023 as there are several gaps for most brands in innovation casava
df <- df %>%
filter(DATE <= "2023-05-21")
#cleanup everything but df
rm(df_long_running, df_small)
#skim(df)
# Assuming df is your dataframe
# set.seed(123) # Set a random seed for reproducibility
# sampled_df <- df[sample(1:nrow(df), 2446143), ]
# rm(df)
# df <- sampled_df
# rm(sampled_df)
#skim(df)
#summary(df)
#casava CASAVA and MOONLIT (regular)
casava <- df %>%
# filter(PACKAGE == "2L MULTI JUG",
filter(
str_detect(ITEM, "CASAVA"),
BRAND == "MOONLIT")
#skim(casava)
#DIET MOONLIT where package is either 2L MULTI JUG or 12SMALL MULTI CUP or 16SMALL 24ONE CUP
diet_moonlit <- df %>%
filter(PACKAGE %in% c("2L MULTI JUG", "12SMALL", "16SMALL") &
BRAND == "DIET MOONLIT")
#skim(diet_moonlit)
#CASAVA and ENERGY
energy <- df %>%
filter(CATEGORY == "ENERGY",
str_detect(ITEM, "CASAVA")
)
# #Diet Energy - too much noise
diet_energy <- df %>%
filter(CATEGORY == "ENERGY",
CALORIC_SEGMENT == 0)
# Merge the data frames
merged_innovation_df <- bind_rows(casava, diet_moonlit, energy, diet_energy)
#merged_innovation_df <- bind_rows(casava, diet_moonlit, energy)
#remove duplicate rows
df <- merged_innovation_df %>% distinct()
skim(df)
| Name | df |
| Number of rows | 72504 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| character | 6 |
| Date | 1 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| CALORIC_SEGMENT | 0 | 1 | 7 | 10 | 0 | 2 | 0 |
| CATEGORY | 0 | 1 | 3 | 6 | 0 | 2 | 0 |
| MANUFACTURER | 0 | 1 | 5 | 8 | 0 | 3 | 0 |
| BRAND | 0 | 1 | 7 | 25 | 0 | 7 | 0 |
| PACKAGE | 0 | 1 | 12 | 17 | 0 | 5 | 0 |
| ITEM | 0 | 1 | 48 | 87 | 0 | 9 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| DATE | 0 | 1 | 2021-01-02 | 2023-05-20 | 2022-07-02 | 125 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| MARKET_KEY | 0 | 1 | 581.92 | 587.35 | 1.00 | 260.00 | 536.00 | 831.00 | 6802.00 | ▇▁▁▁▁ |
| UNIT_SALES | 0 | 1 | 90.60 | 239.33 | 1.00 | 17.00 | 46.00 | 99.00 | 5349.00 | ▇▁▁▁▁ |
| DOLLAR_SALES | 0 | 1 | 179.87 | 462.39 | 0.25 | 30.64 | 82.84 | 186.83 | 11270.01 | ▇▁▁▁▁ |
| POP_SQMI | 0 | 1 | 1383.59 | 1772.35 | 0.18 | 35.12 | 349.46 | 2474.41 | 6769.35 | ▇▂▂▁▁ |
regions_joinme <- read.csv("states_summary.csv")
unique(regions_joinme$REGION)
## [1] "NORTHERN" "DESERT_SW" "PRAIRIE" "CALI_NEVADA" "MOUNTAIN"
## [6] "SOCAL" "ARIZONA" "NEWMEXICO" "NOCAL" "COLORADO"
## [11] "KANSAS"
# "NORTHERN" "DESERT_SW" "PRAIRIE" "CALI_NEVADA" "MOUNTAIN" "SOCAL" "ARIZONA" "NEWMEXICO" "NOCAL" "COLORADO" "KANSAS"
str(regions_joinme)
## 'data.frame': 200 obs. of 2 variables:
## $ MARKET_KEY: int 13 70 179 197 272 352 32 33 44 50 ...
## $ REGION : chr "NORTHERN" "NORTHERN" "DESERT_SW" "DESERT_SW" ...
# Perform a left join using the merge() function
df <- merge(df, regions_joinme[, c("MARKET_KEY", "REGION")], by = "MARKET_KEY", all.x = TRUE)
rm(regions_joinme)
# Update CALORIC_SEGMENT values: 0 if 'DIET/LIGHT', otherwise 1
df$CALORIC_SEGMENT <- ifelse(df$CALORIC_SEGMENT == "DIET/LIGHT", 0, 1)
df$MARKET_KEY <- as.character(df$MARKET_KEY)
df <- df %>%
mutate(
MONTH = as.numeric(substr(DATE, 6, 7)), # Extract the month from YYYY-MM-DD format
SEASON = case_when(
MONTH %in% c(12, 01, 02) ~ "WINTER",
MONTH %in% c(03, 04, 05) ~ "SPRING",
MONTH %in% c(06, 07, 08) ~ "SUMMER",
MONTH %in% c(09, 10, 11) ~ "FALL",
TRUE ~ NA_character_ # This is just in case there are any undefined values
)
)
#save merged_innovation_df back to casava
casava <- df
skim(casava)
| Name | casava |
| Number of rows | 72504 |
| Number of columns | 14 |
| _______________________ | |
| Column type frequency: | |
| character | 8 |
| Date | 1 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| MARKET_KEY | 0 | 1 | 1 | 4 | 0 | 200 | 0 |
| CATEGORY | 0 | 1 | 3 | 6 | 0 | 2 | 0 |
| MANUFACTURER | 0 | 1 | 5 | 8 | 0 | 3 | 0 |
| BRAND | 0 | 1 | 7 | 25 | 0 | 7 | 0 |
| PACKAGE | 0 | 1 | 12 | 17 | 0 | 5 | 0 |
| ITEM | 0 | 1 | 48 | 87 | 0 | 9 | 0 |
| REGION | 0 | 1 | 5 | 11 | 0 | 11 | 0 |
| SEASON | 0 | 1 | 4 | 6 | 0 | 4 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| DATE | 0 | 1 | 2021-01-02 | 2023-05-20 | 2022-07-02 | 125 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| CALORIC_SEGMENT | 0 | 1 | 0.67 | 0.47 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▃▁▁▁▇ |
| UNIT_SALES | 0 | 1 | 90.60 | 239.33 | 1.00 | 17.00 | 46.00 | 99.00 | 5349.00 | ▇▁▁▁▁ |
| DOLLAR_SALES | 0 | 1 | 179.87 | 462.39 | 0.25 | 30.64 | 82.84 | 186.83 | 11270.01 | ▇▁▁▁▁ |
| POP_SQMI | 0 | 1 | 1383.59 | 1772.35 | 0.18 | 35.12 | 349.46 | 2474.41 | 6769.35 | ▇▂▂▁▁ |
| MONTH | 0 | 1 | 6.37 | 3.53 | 1.00 | 3.00 | 6.00 | 10.00 | 12.00 | ▇▆▅▅▇ |
Reworking the subset Casava for more feature engineering.
casava <- casava %>%
mutate(
PACKAGE2 = str_extract(ITEM, "(CUP|JUG).*"), # Extracts the part from CUP or JUG to the end.
ITEM = str_replace(ITEM, "(CUP|JUG).*", "") # Replaces the CUP/JUG and everything after it with empty string in ITEM.
)
casava <- casava %>%
mutate(
TEMP = str_extract(ITEM, "\\d+\\.?\\d*.*"), # Extracts the part from the first number to the end.
PACKAGE2 = if_else(is.na(PACKAGE2), TEMP, paste(PACKAGE2, TEMP)), # Combines existing PACKAGE2 with new extraction if needed.
ITEM = str_replace(ITEM, "\\d+\\.?\\d*.*", ""), # Removes the numeric part and everything after it from ITEM.
TEMP = NULL # Removes the temporary column.
)
na_rows <- casava %>%
filter(is.na(PACKAGE2))
na_rows
## [1] MARKET_KEY DATE CALORIC_SEGMENT CATEGORY
## [5] UNIT_SALES DOLLAR_SALES MANUFACTURER BRAND
## [9] PACKAGE ITEM POP_SQMI REGION
## [13] MONTH SEASON PACKAGE2
## <0 rows> (or 0-length row.names)
#the above steps excised all packaging out of ITEM column
# Function to remove the second instance of any repeating word
remove_second_instance <- function(item) {
words <- unlist(str_split(item, "\\s+")) # Split item into words
unique_words <- unique(words) # Get unique words to check for repeats
for (word in unique_words) {
word_indices <- which(words == word) # Find all indices of the current word
if (length(word_indices) > 1) { # If there is more than one occurrence
words[word_indices[2]] <- "" # Remove the second occurrence
}
}
return(paste(words, collapse = " ")) # Reconstruct sentence without the second instance
}
# Apply the function to the 'ITEM' column
casava <- casava %>%
mutate(ITEM = sapply(ITEM, remove_second_instance))
# #One hot encode either "ENERGY" or "ED" in ITEM as an ENERGY_DRINK
# casava$ENERGY_DRINK <- ifelse(str_detect(casava$ITEM, "ENERGY|' ED'"), 1, 0)
#
# casava$ITEM <- str_replace(casava$ITEM, "ENERGY DRINK", "")
# casava$ITEM <- str_replace(casava$ITEM, "ENERGY", "")
# casava$ITEM <- str_replace(casava$ITEM, " ED", "")
# table(casava$ENERGY_DRINK)
#
# table(casava$CATEGORY)
#
# casava %>%
# filter(ENERGY_DRINK == 1,
# CATEGORY=='SSD') %>%
# select(ITEM) %>%
# head(10)
# Remove specific columns
#casava <- select(casava, -PACKAGE2, -CATEGORY)
head(casava)
## MARKET_KEY DATE CALORIC_SEGMENT CATEGORY UNIT_SALES DOLLAR_SALES
## 1 1 2022-09-17 1 ENERGY 21 38.51
## 2 1 2021-09-18 1 ENERGY 27 45.03
## 3 1 2022-11-05 1 ENERGY 33 91.62
## 4 1 2023-04-29 1 ENERGY 54 96.99
## 5 1 2023-04-01 1 ENERGY 23 56.79
## 6 1 2023-04-29 1 ENERGY 24 53.46
## MANUFACTURER BRAND PACKAGE
## 1 JOLLYS SUPER-DUPER JUICED 16SMALL MULTI CUP
## 2 JOLLYS SUPER-DUPER JUICED 16SMALL MULTI CUP
## 3 PONYS MYTHICAL BEVERAGE 16SMALL MULTI CUP
## 4 JOLLYS SUPER-DUPER RECOVERY 16SMALL MULTI CUP
## 5 PONYS MYTHICAL BEVERAGE 16SMALL MULTI CUP
## 6 PONYS MYTHICAL BEVERAGE 16SMALL MULTI CUP
## ITEM POP_SQMI REGION MONTH
## 1 SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA 1.201114 NORTHERN 9
## 2 SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA 1.201114 NORTHERN 9
## 3 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA 1.201114 NORTHERN 11
## 4 SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK 1.201114 NORTHERN 4
## 5 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA 1.201114 NORTHERN 4
## 6 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA 1.201114 NORTHERN 4
## SEASON PACKAGE2
## 1 FALL CUP 16 LIQUID SMALL NA
## 2 FALL CUP 16 LIQUID SMALL NA
## 3 FALL CUP 16 LIQUID SMALL NA
## 4 SPRING CUP 16 LIQUID SMALL NA
## 5 SPRING CUP 16 LIQUID SMALL NA
## 6 SPRING CUP 16 LIQUID SMALL NA
table(casava$ITEM)
##
## JUMPIN-FISH ENERGY DRINK CASAVA JACK
## 8
## MOONLIT GENTLE DRINK CASAVA
## 1540
## MOONLIT GENTLE DRINK SUNSET
## 23881
## MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 17089
## SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
## 16930
## SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS
## 4774
## SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK
## 8282
#write.csv(casava_small, "casava_yellow.csv", row.names = FALSE)
#write.csv(diet_moonlit_df, "diet_moonlit.csv", row.names = FALSE)
#Trim trailing white space at end of ITEM
casava$ITEM <- str_trim(casava$ITEM, side = "right")
# #replace "GENTLE DRINK" with "" in ITEM
# casava$ITEM <- str_replace(casava$ITEM, "GENTLE DRINK", "")
#One hot encode "NO ARTIFICAL SWEETNERS" in ITEM
# casava$NO_ARTIFICIAL_SWEETNERS <- ifelse(str_detect(casava$ITEM,
# "NO ARTIFICIAL SWEETENERS"),
# 1, 0)
#
# table(casava$NO_ARTIFICIAL_SWEETNERS)
# # #Remove "NO ARTIFICIAL SWEETNERS" from ITEM
# casava$ITEM <- str_replace(casava$ITEM, "NO ARTIFICIAL SWEETENERS", "")
#Remove Health Supplement rows
# casava <- casava %>%
# filter(!str_detect(ITEM, "HEALTH SUPPLEMENT"))
# pattern <- "ZERO CALORIES|ZERO CALORIE|ZERO SUGAR|SUGAR FREE|NO CALORIES|ZERO CARB|PURE ZERO|DIET"
#
# casava <- casava %>%
# mutate(
# CALORIC_SEGMENT_TEXT = str_extract(ITEM, pattern), # Extracts matching text based on the pattern.
# ITEM = str_replace_all(ITEM, pattern, "") # Removes extracted text from ITEM.
# )
#remove mythical beverage - monster reserve casava (pineapple)
#casava <- casava %>%
# filter(!str_detect(ITEM, "MYTHICAL BEVERAGE"))
# Remove JUMPING JACK - sporadic single week
casava <- casava %>%
filter(!str_detect(ITEM, "JUMPIN-FISH CASAVA JACK"))
#Remove "SUPER-DUPER CASAVA YELLOW"
# casava <- casava %>%
# filter(!str_detect(ITEM, "SUPER-DUPER PURE CASAVA YELLOW"))
#drop row with MOONLIT SUNSET in ITEM
casava <- casava %>%
filter(!str_detect(ITEM, "MOONLIT SUNSET"))
print(unique(casava$ITEM))
## [1] "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA"
## [2] "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA"
## [3] "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK"
## [4] "SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS"
## [5] "MOONLIT GENTLE DRINK SUNSET"
## [6] "MOONLIT GENTLE DRINK CASAVA"
## [7] "JUMPIN-FISH ENERGY DRINK CASAVA JACK"
# Find the minimum launch date for each product
min_launch_dates <- casava %>%
group_by(ITEM) %>%
summarise(min_launch_date = min(DATE))
# Join the minimum launch dates back to the original data
casava <- casava %>%
left_join(min_launch_dates, by = "ITEM")
# Calculate the number of weeks since the product launch
casava <- casava %>%
mutate(WEEKS_SINCE_LAUNCH = as.numeric(difftime(DATE, min_launch_date, units = "weeks")))
# Selecting required columns and printing the first 10 rows
casava %>%
filter(UNIT_SALES > 0) %>%
select(DATE, ITEM, WEEKS_SINCE_LAUNCH) %>%
head(10)
## DATE ITEM
## 1 2022-09-17 SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
## 2 2021-09-18 SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
## 3 2022-11-05 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 4 2023-04-29 SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK
## 5 2023-04-01 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 6 2023-04-29 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 7 2022-05-14 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 8 2022-05-07 SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
## 9 2022-09-03 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 10 2022-10-29 SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
## WEEKS_SINCE_LAUNCH
## 1 89
## 2 37
## 3 60
## 4 45
## 5 81
## 6 85
## 7 35
## 8 70
## 9 51
## 10 95
#Subtract before April 19th from WEEKS_SINCE_LAUNCH where ITEM === "SUPER-DUPER JUICED CASAVA SUNSET GUAVA" actually launches and takes off, replace anything less than 0 with 0.
#Remove rows from ITEM == "SUPER-DUPER JUICED CASAVA SUNSET GUAVA" before April 19th 2021 when something about it actually launches
casava <- casava %>%
filter(!(ITEM == "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA" & DATE < as.Date("2021-04-19")))
#Subtract WEEKS SINCE LAUNCH for ITEM == "SUPER-DUPER JUICED CASAVA SUNSET GUAVA" by 16 weeks, so real launch date matches up.
casava <- casava %>%
mutate(WEEKS_SINCE_LAUNCH = ifelse(ITEM == "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA", WEEKS_SINCE_LAUNCH - 16, WEEKS_SINCE_LAUNCH))
#Set any negative WEEKS_SINCE_LAUNCH to 0
casava <- casava %>%
mutate(WEEKS_SINCE_LAUNCH = ifelse(WEEKS_SINCE_LAUNCH < 0, 0, WEEKS_SINCE_LAUNCH))
#min date of ITEM "SUPER-DUPER JUICED CASAVA SUNSET GUAVA"
casava %>%
filter(ITEM == "SUPER-DUPER JUICED CASAVA SUNSET GUAVA") %>%
summarise(min(DATE))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `min(DATE)`.
## Caused by warning in `min.default()`:
## ! no non-missing arguments to min; returning Inf
## min(DATE)
## 1 Inf
print(unique(casava$ITEM))
## [1] "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA"
## [2] "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA"
## [3] "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK"
## [4] "SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS"
## [5] "MOONLIT GENTLE DRINK SUNSET"
## [6] "MOONLIT GENTLE DRINK CASAVA"
## [7] "JUMPIN-FISH ENERGY DRINK CASAVA JACK"
print(unique(casava$BRAND))
## [1] "SUPER-DUPER JUICED" "MYTHICAL BEVERAGE"
## [3] "SUPER-DUPER RECOVERY" "SUPER-DUPER PUNCHED"
## [5] "DIET MOONLIT" "MOONLIT"
## [7] "HILL MOISTURE JUMPIN-FISH"
print(unique(casava$CATEGORY))
## [1] "ENERGY" "SSD"
print(unique(casava$PACKAGE))
## [1] "16SMALL MULTI CUP" "2L MULTI JUG" "20SMALL MULTI JUG"
## [4] "16SMALL 24ONE CUP" "12SMALL MULTI CUP"
#print(unique(casava$CALORIC_SEGMENT_TEXT))
print(unique(casava$CALORIC_SEGMENT))
## [1] 1 0
#What percent of UNIT SALE are "2L MULTI JUG"
casava %>%
filter(PACKAGE == "2L MULTI JUG") %>%
summarise(UNIT_SALES = sum(UNIT_SALES)) %>%
mutate(PERCENTAGE = UNIT_SALES / sum(casava$UNIT_SALES) * 100)
## UNIT_SALES PERCENTAGE
## 1 2174965 33.10922
#What percent of UNIT SALE are "DIET"
casava %>%
filter(CALORIC_SEGMENT == "DIET") %>%
summarise(UNIT_SALES = sum(UNIT_SALES)) %>%
mutate(PERCENTAGE = UNIT_SALES / sum(casava$UNIT_SALES) * 100)
## UNIT_SALES PERCENTAGE
## 1 0 0
#Test removing ITEMS containing MOONLIT CASAVA and MYSTICAL BEVERAGE
# unique(casava$BRAND)
# casava <- casava %>%
# filter(!BRAND=="MOONLIT",
# !BRAND=="MYTHICAL BEVERAGE")
# unique(casava$BRAND)
#
write_csv(casava, "casava_tableau.csv")
str(casava)
## 'data.frame': 72455 obs. of 17 variables:
## $ MARKET_KEY : chr "1" "1" "1" "1" ...
## $ DATE : Date, format: "2022-09-17" "2021-09-18" ...
## $ CALORIC_SEGMENT : num 1 1 1 1 1 1 1 1 1 1 ...
## $ CATEGORY : chr "ENERGY" "ENERGY" "ENERGY" "ENERGY" ...
## $ UNIT_SALES : num 21 27 33 54 23 24 40 46 44 20 ...
## $ DOLLAR_SALES : num 38.5 45 91.6 97 56.8 ...
## $ MANUFACTURER : chr "JOLLYS" "JOLLYS" "PONYS" "JOLLYS" ...
## $ BRAND : chr "SUPER-DUPER JUICED" "SUPER-DUPER JUICED" "MYTHICAL BEVERAGE" "SUPER-DUPER RECOVERY" ...
## $ PACKAGE : chr "16SMALL MULTI CUP" "16SMALL MULTI CUP" "16SMALL MULTI CUP" "16SMALL MULTI CUP" ...
## $ ITEM : chr "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA" "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA" "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA" "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK" ...
## $ POP_SQMI : num 1.2 1.2 1.2 1.2 1.2 ...
## $ REGION : chr "NORTHERN" "NORTHERN" "NORTHERN" "NORTHERN" ...
## $ MONTH : num 9 9 11 4 4 4 5 5 9 10 ...
## $ SEASON : chr "FALL" "FALL" "FALL" "SPRING" ...
## $ PACKAGE2 : chr "CUP 16 LIQUID SMALL NA" "CUP 16 LIQUID SMALL NA" "CUP 16 LIQUID SMALL NA" "CUP 16 LIQUID SMALL NA" ...
## $ min_launch_date : Date, format: "2021-01-02" "2021-01-02" ...
## $ WEEKS_SINCE_LAUNCH: num 73 21 60 45 81 85 35 54 51 79 ...
#remove all objects other than casava
rm(list = setdiff(ls(), "casava"))
print(unique(casava$ITEM))
## [1] "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA"
## [2] "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA"
## [3] "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK"
## [4] "SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS"
## [5] "MOONLIT GENTLE DRINK SUNSET"
## [6] "MOONLIT GENTLE DRINK CASAVA"
## [7] "JUMPIN-FISH ENERGY DRINK CASAVA JACK"
print(unique(casava$BRAND))
## [1] "SUPER-DUPER JUICED" "MYTHICAL BEVERAGE"
## [3] "SUPER-DUPER RECOVERY" "SUPER-DUPER PUNCHED"
## [5] "DIET MOONLIT" "MOONLIT"
## [7] "HILL MOISTURE JUMPIN-FISH"
print(unique(casava$CATEGORY))
## [1] "ENERGY" "SSD"
print(unique(casava$PACKAGE))
## [1] "16SMALL MULTI CUP" "2L MULTI JUG" "20SMALL MULTI JUG"
## [4] "16SMALL 24ONE CUP" "12SMALL MULTI CUP"
#what caloric segment is ITEM "MOONLIT SUNSET"
casava %>%
filter(ITEM == "MOONLIT SUNSET")
## [1] MARKET_KEY DATE CALORIC_SEGMENT CATEGORY
## [5] UNIT_SALES DOLLAR_SALES MANUFACTURER BRAND
## [9] PACKAGE ITEM POP_SQMI REGION
## [13] MONTH SEASON PACKAGE2 min_launch_date
## [17] WEEKS_SINCE_LAUNCH
## <0 rows> (or 0-length row.names)
# Creating an 'innovation' data frame
#factor all character variables
casava$REGION <- as.factor(casava$REGION)
#casava$CATEGORY <- as.factor(casava$CATEGORY)
#casava$BRAND <- as.factor(casava$BRAND)
casava$SEASON <- as.factor(casava$SEASON)
casava$PACKAGE2 <- as.factor(casava$PACKAGE2)
#model <- lm(DOLLAR_SALES ~ UNIT_SALES + CALORIC_SEGMENT + POP_SQMI + REGION + CATEGORY + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = casava)
# model <- lm(DOLLAR_SALES ~ UNIT_SALES + POP_SQMI + REGION + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = casava)
# summary(model)
# Creating an 'innovation' data frame
#model <- lm(UNIT_SALES ~ DOLLAR_SALES + CALORIC_SEGMENT + PACKAGE + POP_SQMI + REGION + CATEGORY + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = casava)
# model <- lm(UNIT_SALES ~ DOLLAR_SALES + PACKAGE + POP_SQMI + REGION + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = casava)
# summary(model)
# Creating an 'innovation' data frame
# #model <- lm(UNIT_SALES ~ CALORIC_SEGMENT + PACKAGE + POP_SQMI + REGION + CATEGORY + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = casava)
# model <- lm(UNIT_SALES ~ + PACKAGE + POP_SQMI + REGION + MONTH + SEASON + PACKAGE2 + WEEKS_SINCE_LAUNCH, data = casava)
# summary(model)
# Load and prepare dataset
df <- read.csv("casava_tableau.csv")
# Load and prepare dataset
str(df)
## 'data.frame': 72455 obs. of 17 variables:
## $ MARKET_KEY : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DATE : chr "2022-09-17" "2021-09-18" "2022-11-05" "2023-04-29" ...
## $ CALORIC_SEGMENT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ CATEGORY : chr "ENERGY" "ENERGY" "ENERGY" "ENERGY" ...
## $ UNIT_SALES : int 21 27 33 54 23 24 40 46 44 20 ...
## $ DOLLAR_SALES : num 38.5 45 91.6 97 56.8 ...
## $ MANUFACTURER : chr "JOLLYS" "JOLLYS" "PONYS" "JOLLYS" ...
## $ BRAND : chr "SUPER-DUPER JUICED" "SUPER-DUPER JUICED" "MYTHICAL BEVERAGE" "SUPER-DUPER RECOVERY" ...
## $ PACKAGE : chr "16SMALL MULTI CUP" "16SMALL MULTI CUP" "16SMALL MULTI CUP" "16SMALL MULTI CUP" ...
## $ ITEM : chr "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA" "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA" "MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA" "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK" ...
## $ POP_SQMI : num 1.2 1.2 1.2 1.2 1.2 ...
## $ REGION : chr "NORTHERN" "NORTHERN" "NORTHERN" "NORTHERN" ...
## $ MONTH : int 9 9 11 4 4 4 5 5 9 10 ...
## $ SEASON : chr "FALL" "FALL" "FALL" "SPRING" ...
## $ PACKAGE2 : chr "CUP 16 LIQUID SMALL NA" "CUP 16 LIQUID SMALL NA" "CUP 16 LIQUID SMALL NA" "CUP 16 LIQUID SMALL NA" ...
## $ min_launch_date : chr "2021-01-02" "2021-01-02" "2021-09-11" "2022-06-18" ...
## $ WEEKS_SINCE_LAUNCH: int 73 21 60 45 81 85 35 54 51 79 ...
casava <- df %>%
#select(-DATE, -MONTH, -SEASON, -BRAND, -REGION, -ITEM )
#select(-MONTH, -SEASON, -min_launch_date, -PACKAGE2, -CALORIC_SEGMENT_TEXT)
select(-MONTH, -SEASON, -min_launch_date, -PACKAGE2)
# Assuming casava is your data frame and PACKAGING is the column of interest
# Create new columns in woodsy for each unique substring
# Each column will have a 1 if the substring is found in the PACKAGING column, 0 otherwise
casava$`16SMALL MULTI CUP` = as.integer(grepl("16SMALL MULTI CUP", casava$PACKAGE))
casava$`20SMALL MULTI JUG` = as.integer(grepl("20SMALL MULTI JUG", casava$PACKAGE))
casava$`16SMALL 24ONE CUP` = as.integer(grepl("16SMALL 24ONE CUP", casava$PACKAGE))
casava$`2L MULTI JUG` = as.integer(grepl("2L MULTI JUG", casava$PACKAGE))
#one hot encode non brand ITEM strings
casava$SUNSET = as.integer(grepl("SUNSET", casava$ITEM))
casava$BLAST = as.integer(grepl("BLAST", casava$ITEM))
casava$JUICED = as.integer(grepl("JUICED", casava$ITEM))
casava$GUAVA = as.integer(grepl("GUAVA", casava$ITEM))
casava$RECOVERY = as.integer(grepl("RECOVERY", casava$ITEM))
casava$JACK = as.integer(grepl("JACK", casava$ITEM))
casava$RESERVE= as.integer(grepl("RESERVE", casava$ITEM))
casava$WHITE= as.integer(grepl("WHITE", casava$ITEM))
casava$PITAYA= as.integer(grepl("PITAYA", casava$ITEM))
casava$ED= as.integer(grepl("ED", casava$ITEM))
casava$CASAVA= as.integer(grepl("CASAVA", casava$ITEM))
# Print the head of the data frame to see the first few rows
head(casava)
## MARKET_KEY DATE CALORIC_SEGMENT CATEGORY UNIT_SALES DOLLAR_SALES
## 1 1 2022-09-17 1 ENERGY 21 38.51
## 2 1 2021-09-18 1 ENERGY 27 45.03
## 3 1 2022-11-05 1 ENERGY 33 91.62
## 4 1 2023-04-29 1 ENERGY 54 96.99
## 5 1 2023-04-01 1 ENERGY 23 56.79
## 6 1 2023-04-29 1 ENERGY 24 53.46
## MANUFACTURER BRAND PACKAGE
## 1 JOLLYS SUPER-DUPER JUICED 16SMALL MULTI CUP
## 2 JOLLYS SUPER-DUPER JUICED 16SMALL MULTI CUP
## 3 PONYS MYTHICAL BEVERAGE 16SMALL MULTI CUP
## 4 JOLLYS SUPER-DUPER RECOVERY 16SMALL MULTI CUP
## 5 PONYS MYTHICAL BEVERAGE 16SMALL MULTI CUP
## 6 PONYS MYTHICAL BEVERAGE 16SMALL MULTI CUP
## ITEM POP_SQMI REGION
## 1 SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA 1.201114 NORTHERN
## 2 SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA 1.201114 NORTHERN
## 3 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA 1.201114 NORTHERN
## 4 SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK 1.201114 NORTHERN
## 5 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA 1.201114 NORTHERN
## 6 MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA 1.201114 NORTHERN
## WEEKS_SINCE_LAUNCH 16SMALL MULTI CUP 20SMALL MULTI JUG 16SMALL 24ONE CUP
## 1 73 1 0 0
## 2 21 1 0 0
## 3 60 1 0 0
## 4 45 1 0 0
## 5 81 1 0 0
## 6 85 1 0 0
## 2L MULTI JUG SUNSET BLAST JUICED GUAVA RECOVERY JACK RESERVE WHITE PITAYA ED
## 1 0 1 0 1 1 0 0 0 0 0 1
## 2 0 1 0 1 1 0 0 0 0 0 1
## 3 0 0 0 0 0 0 0 1 1 0 0
## 4 0 0 0 0 0 1 1 0 0 0 0
## 5 0 0 0 0 0 0 0 1 1 0 0
## 6 0 0 0 0 0 0 0 1 1 0 0
## CASAVA
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
casava$CATEGORY <- NULL
casava$MARKET_KEY <- NULL
casava$MANUFACTURER <- NULL
casava$PACKAGE <- NULL
library(fastDummies)
# One-hot encode the specified columns
casava <- fastDummies::dummy_cols(casava, select_columns = c("REGION", "ITEM"), remove_selected_columns = TRUE)
#casava <- fastDummies::dummy_cols(casava, select_columns = c("REGION", "SEASON","ITEM"), remove_selected_columns = TRUE)
# View the first few rows to verify the changes
head(casava)
## DATE CALORIC_SEGMENT UNIT_SALES DOLLAR_SALES BRAND
## 1 2022-09-17 1 21 38.51 SUPER-DUPER JUICED
## 2 2021-09-18 1 27 45.03 SUPER-DUPER JUICED
## 3 2022-11-05 1 33 91.62 MYTHICAL BEVERAGE
## 4 2023-04-29 1 54 96.99 SUPER-DUPER RECOVERY
## 5 2023-04-01 1 23 56.79 MYTHICAL BEVERAGE
## 6 2023-04-29 1 24 53.46 MYTHICAL BEVERAGE
## POP_SQMI WEEKS_SINCE_LAUNCH 16SMALL MULTI CUP 20SMALL MULTI JUG
## 1 1.201114 73 1 0
## 2 1.201114 21 1 0
## 3 1.201114 60 1 0
## 4 1.201114 45 1 0
## 5 1.201114 81 1 0
## 6 1.201114 85 1 0
## 16SMALL 24ONE CUP 2L MULTI JUG SUNSET BLAST JUICED GUAVA RECOVERY JACK
## 1 0 0 1 0 1 1 0 0
## 2 0 0 1 0 1 1 0 0
## 3 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 1 1
## 5 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0
## RESERVE WHITE PITAYA ED CASAVA REGION_ARIZONA REGION_CALI_NEVADA
## 1 0 0 0 1 1 0 0
## 2 0 0 0 1 1 0 0
## 3 1 1 0 0 1 0 0
## 4 0 0 0 0 1 0 0
## 5 1 1 0 0 1 0 0
## 6 1 1 0 0 1 0 0
## REGION_COLORADO REGION_DESERT_SW REGION_KANSAS REGION_MOUNTAIN
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## REGION_NEWMEXICO REGION_NOCAL REGION_NORTHERN REGION_PRAIRIE REGION_SOCAL
## 1 0 0 1 0 0
## 2 0 0 1 0 0
## 3 0 0 1 0 0
## 4 0 0 1 0 0
## 5 0 0 1 0 0
## 6 0 0 1 0 0
## ITEM_JUMPIN-FISH ENERGY DRINK CASAVA JACK ITEM_MOONLIT GENTLE DRINK CASAVA
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## ITEM_MOONLIT GENTLE DRINK SUNSET
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## ITEM_MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 1 0
## 2 0
## 3 1
## 4 0
## 5 1
## 6 1
## ITEM_SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
## 1 1
## 2 1
## 3 0
## 4 0
## 5 0
## 6 0
## ITEM_SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## ITEM_SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK
## 1 0
## 2 0
## 3 0
## 4 1
## 5 0
## 6 0
write.csv(casava, "casava_one_hot.csv", row.names = FALSE)
library(fastDummies)
# One-hot encode
casava <- fastDummies::dummy_cols(casava, select_columns = "BRAND", remove_selected_columns = TRUE)
# View the first few rows to verify
head(casava)
## DATE CALORIC_SEGMENT UNIT_SALES DOLLAR_SALES POP_SQMI
## 1 2022-09-17 1 21 38.51 1.201114
## 2 2021-09-18 1 27 45.03 1.201114
## 3 2022-11-05 1 33 91.62 1.201114
## 4 2023-04-29 1 54 96.99 1.201114
## 5 2023-04-01 1 23 56.79 1.201114
## 6 2023-04-29 1 24 53.46 1.201114
## WEEKS_SINCE_LAUNCH 16SMALL MULTI CUP 20SMALL MULTI JUG 16SMALL 24ONE CUP
## 1 73 1 0 0
## 2 21 1 0 0
## 3 60 1 0 0
## 4 45 1 0 0
## 5 81 1 0 0
## 6 85 1 0 0
## 2L MULTI JUG SUNSET BLAST JUICED GUAVA RECOVERY JACK RESERVE WHITE PITAYA ED
## 1 0 1 0 1 1 0 0 0 0 0 1
## 2 0 1 0 1 1 0 0 0 0 0 1
## 3 0 0 0 0 0 0 0 1 1 0 0
## 4 0 0 0 0 0 1 1 0 0 0 0
## 5 0 0 0 0 0 0 0 1 1 0 0
## 6 0 0 0 0 0 0 0 1 1 0 0
## CASAVA REGION_ARIZONA REGION_CALI_NEVADA REGION_COLORADO REGION_DESERT_SW
## 1 1 0 0 0 0
## 2 1 0 0 0 0
## 3 1 0 0 0 0
## 4 1 0 0 0 0
## 5 1 0 0 0 0
## 6 1 0 0 0 0
## REGION_KANSAS REGION_MOUNTAIN REGION_NEWMEXICO REGION_NOCAL REGION_NORTHERN
## 1 0 0 0 0 1
## 2 0 0 0 0 1
## 3 0 0 0 0 1
## 4 0 0 0 0 1
## 5 0 0 0 0 1
## 6 0 0 0 0 1
## REGION_PRAIRIE REGION_SOCAL ITEM_JUMPIN-FISH ENERGY DRINK CASAVA JACK
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## ITEM_MOONLIT GENTLE DRINK CASAVA ITEM_MOONLIT GENTLE DRINK SUNSET
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## ITEM_MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 1 0
## 2 0
## 3 1
## 4 0
## 5 1
## 6 1
## ITEM_SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
## 1 1
## 2 1
## 3 0
## 4 0
## 5 0
## 6 0
## ITEM_SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## ITEM_SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK BRAND_DIET MOONLIT
## 1 0 0
## 2 0 0
## 3 0 0
## 4 1 0
## 5 0 0
## 6 0 0
## BRAND_HILL MOISTURE JUMPIN-FISH BRAND_MOONLIT BRAND_MYTHICAL BEVERAGE
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 0 0 0
## 5 0 0 1
## 6 0 0 1
## BRAND_SUPER-DUPER JUICED BRAND_SUPER-DUPER PUNCHED BRAND_SUPER-DUPER RECOVERY
## 1 1 0 0
## 2 1 0 0
## 3 0 0 0
## 4 0 0 1
## 5 0 0 0
## 6 0 0 0
#create new week of year column
casava <- casava %>%
mutate(DATE = as.Date(DATE)) %>%
mutate(WEEK_OF_YEAR = lubridate::week(DATE))
#Drop DATE column
casava$DATE <- NULL
# Summarize the dataset
skimr::skim(casava)
| Name | casava |
| Number of rows | 72455 |
| Number of columns | 46 |
| _______________________ | |
| Column type frequency: | |
| numeric | 46 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| CALORIC_SEGMENT | 0 | 1 | 0.67 | 0.47 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▃▁▁▁▇ |
| UNIT_SALES | 0 | 1 | 90.66 | 239.40 | 1.00 | 17.00 | 46.00 | 99.00 | 5349.00 | ▇▁▁▁▁ |
| DOLLAR_SALES | 0 | 1 | 179.99 | 462.52 | 0.25 | 30.70 | 82.93 | 187.00 | 11270.01 | ▇▁▁▁▁ |
| POP_SQMI | 0 | 1 | 1384.01 | 1772.61 | 0.18 | 35.12 | 349.46 | 2474.41 | 6769.35 | ▇▂▂▁▁ |
| WEEKS_SINCE_LAUNCH | 0 | 1 | 48.25 | 31.55 | 0.00 | 22.00 | 43.00 | 72.00 | 124.00 | ▇▇▆▅▂ |
| 16SMALL MULTI CUP | 0 | 1 | 0.65 | 0.48 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▅▁▁▁▇ |
| 20SMALL MULTI JUG | 0 | 1 | 0.01 | 0.11 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| 16SMALL 24ONE CUP | 0 | 1 | 0.00 | 0.02 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| 2L MULTI JUG | 0 | 1 | 0.34 | 0.47 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▅ |
| SUNSET | 0 | 1 | 0.56 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▆▁▁▁▇ |
| BLAST | 0 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
| JUICED | 0 | 1 | 0.23 | 0.42 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| GUAVA | 0 | 1 | 0.23 | 0.42 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| RECOVERY | 0 | 1 | 0.11 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| JACK | 0 | 1 | 0.11 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| RESERVE | 0 | 1 | 0.24 | 0.42 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| WHITE | 0 | 1 | 0.24 | 0.42 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| PITAYA | 0 | 1 | 0.07 | 0.25 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| ED | 0 | 1 | 0.30 | 0.46 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
| CASAVA | 0 | 1 | 0.67 | 0.47 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▃▁▁▁▇ |
| REGION_ARIZONA | 0 | 1 | 0.21 | 0.41 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| REGION_CALI_NEVADA | 0 | 1 | 0.03 | 0.18 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_COLORADO | 0 | 1 | 0.12 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_DESERT_SW | 0 | 1 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_KANSAS | 0 | 1 | 0.02 | 0.13 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_MOUNTAIN | 0 | 1 | 0.10 | 0.30 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_NEWMEXICO | 0 | 1 | 0.04 | 0.20 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_NOCAL | 0 | 1 | 0.04 | 0.20 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_NORTHERN | 0 | 1 | 0.26 | 0.44 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
| REGION_PRAIRIE | 0 | 1 | 0.02 | 0.15 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| REGION_SOCAL | 0 | 1 | 0.08 | 0.27 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| ITEM_JUMPIN-FISH ENERGY DRINK CASAVA JACK | 0 | 1 | 0.00 | 0.01 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| ITEM_MOONLIT GENTLE DRINK CASAVA | 0 | 1 | 0.02 | 0.14 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| ITEM_MOONLIT GENTLE DRINK SUNSET | 0 | 1 | 0.33 | 0.47 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
| ITEM_MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA | 0 | 1 | 0.24 | 0.42 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| ITEM_SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA | 0 | 1 | 0.23 | 0.42 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| ITEM_SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS | 0 | 1 | 0.07 | 0.25 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| ITEM_SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK | 0 | 1 | 0.11 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| BRAND_DIET MOONLIT | 0 | 1 | 0.33 | 0.47 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
| BRAND_HILL MOISTURE JUMPIN-FISH | 0 | 1 | 0.00 | 0.01 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| BRAND_MOONLIT | 0 | 1 | 0.02 | 0.14 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| BRAND_MYTHICAL BEVERAGE | 0 | 1 | 0.24 | 0.42 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| BRAND_SUPER-DUPER JUICED | 0 | 1 | 0.23 | 0.42 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| BRAND_SUPER-DUPER PUNCHED | 0 | 1 | 0.07 | 0.25 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| BRAND_SUPER-DUPER RECOVERY | 0 | 1 | 0.11 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| WEEK_OF_YEAR | 0 | 1 | 26.13 | 15.56 | 1.00 | 12.00 | 26.00 | 40.00 | 53.00 | ▇▇▆▆▇ |
#remove top one percent of unit sales to clean up outliers
df <- casava %>%
filter(UNIT_SALES < quantile(UNIT_SALES, 0.99))
# Split the data
set.seed(123)
df_testtrn <- initial_split(df, prop = 0.8, strata = UNIT_SALES)
Train <- training(df_testtrn)
Test <- testing(df_testtrn)
# Prepare features and labels for XGBoost
train_features <- Train[, -which(names(Train) == "UNIT_SALES")]
train_labels <- Train$UNIT_SALES
test_features <- Test[, -which(names(Test) == "UNIT_SALES")]
test_labels <- Test$UNIT_SALES
# Convert data to DMatrix format
dtrain <- xgb.DMatrix(data = as.matrix(train_features), label = train_labels)
dtest <- xgb.DMatrix(data = as.matrix(test_features), label = test_labels)
# Define XGBoost parameters
set.seed(123)
params <- list(
booster = "gbtree",
objective = "reg:squarederror",
eval_metric = "rmse",
eta = 0.05,
max_depth = 4,
min_child_weight = 3,
subsample = 0.7,
colsample_bytree = 0.6,
lambda = 1,
alpha = 1
)
# Perform cross-validation to find the optimal number of boosting rounds
cv_results <- xgb.cv(
params = params,
data = dtrain,
nfold = 5,
nrounds = 500, # Changed from 'num_boost_round' to 'nrounds'
early_stopping_rounds = 10,
metrics = "rmse",
seed = 123
)
## [1] train-rmse:102.689882+0.780233 test-rmse:102.712290+1.295163
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
##
## [2] train-rmse:98.702083+1.363531 test-rmse:98.740296+2.023720
## [3] train-rmse:94.937884+1.580653 test-rmse:94.981742+2.173403
## [4] train-rmse:90.674844+1.119582 test-rmse:90.711874+1.687706
## [5] train-rmse:86.619910+0.943238 test-rmse:86.649333+1.367550
## [6] train-rmse:82.755484+0.656333 test-rmse:82.774541+0.939479
## [7] train-rmse:79.675260+1.132207 test-rmse:79.700101+1.358782
## [8] train-rmse:76.759897+1.470844 test-rmse:76.800500+1.900275
## [9] train-rmse:73.416355+1.358047 test-rmse:73.457145+1.890681
## [10] train-rmse:71.052956+1.462888 test-rmse:71.092276+1.889188
## [11] train-rmse:68.569834+2.035293 test-rmse:68.606878+2.380012
## [12] train-rmse:65.591955+1.787084 test-rmse:65.631687+2.206648
## [13] train-rmse:63.075809+2.206655 test-rmse:63.116145+2.508841
## [14] train-rmse:61.203811+2.423265 test-rmse:61.242254+2.661690
## [15] train-rmse:59.461200+2.777820 test-rmse:59.503389+3.034561
## [16] train-rmse:57.778471+2.806387 test-rmse:57.829825+3.126994
## [17] train-rmse:55.921283+2.699465 test-rmse:55.968870+3.013889
## [18] train-rmse:54.212742+3.096257 test-rmse:54.271128+3.461112
## [19] train-rmse:52.760410+2.788335 test-rmse:52.821160+3.137513
## [20] train-rmse:51.172859+2.955435 test-rmse:51.237998+3.294276
## [21] train-rmse:49.982374+3.337959 test-rmse:50.049419+3.680094
## [22] train-rmse:47.991008+3.162963 test-rmse:48.057874+3.449763
## [23] train-rmse:46.821312+2.985901 test-rmse:46.892708+3.269121
## [24] train-rmse:45.052367+3.256110 test-rmse:45.130489+3.537563
## [25] train-rmse:44.014604+3.194726 test-rmse:44.097792+3.473161
## [26] train-rmse:42.572573+3.071974 test-rmse:42.652957+3.375914
## [27] train-rmse:41.222055+3.087495 test-rmse:41.301575+3.326996
## [28] train-rmse:39.453078+2.946359 test-rmse:39.532408+3.174277
## [29] train-rmse:38.008798+2.900296 test-rmse:38.093286+3.076730
## [30] train-rmse:36.784878+2.654663 test-rmse:36.872072+2.833746
## [31] train-rmse:35.814742+2.424238 test-rmse:35.892399+2.624422
## [32] train-rmse:34.476246+2.070291 test-rmse:34.564947+2.273299
## [33] train-rmse:33.417028+1.938389 test-rmse:33.499379+2.155314
## [34] train-rmse:32.418829+1.796358 test-rmse:32.510437+1.999621
## [35] train-rmse:31.444095+1.426164 test-rmse:31.544917+1.670367
## [36] train-rmse:30.358567+1.108329 test-rmse:30.456026+1.351819
## [37] train-rmse:29.682774+1.402499 test-rmse:29.793692+1.666687
## [38] train-rmse:28.879992+1.526556 test-rmse:29.001566+1.756459
## [39] train-rmse:28.243808+1.357528 test-rmse:28.370230+1.612884
## [40] train-rmse:27.324583+1.094793 test-rmse:27.452676+1.351420
## [41] train-rmse:26.604741+1.038062 test-rmse:26.735600+1.278487
## [42] train-rmse:26.070942+1.080355 test-rmse:26.203319+1.319672
## [43] train-rmse:25.537087+1.000423 test-rmse:25.674606+1.218219
## [44] train-rmse:25.027773+0.697903 test-rmse:25.170768+0.924103
## [45] train-rmse:24.288867+0.693500 test-rmse:24.439479+0.947837
## [46] train-rmse:23.594417+0.776504 test-rmse:23.757349+1.035764
## [47] train-rmse:23.050716+0.877753 test-rmse:23.215963+1.102088
## [48] train-rmse:22.549829+1.089395 test-rmse:22.721535+1.328736
## [49] train-rmse:22.030301+1.038642 test-rmse:22.204668+1.242223
## [50] train-rmse:21.546810+1.067016 test-rmse:21.727535+1.225743
## [51] train-rmse:21.304801+1.041195 test-rmse:21.486511+1.238556
## [52] train-rmse:20.972408+1.137462 test-rmse:21.160810+1.297066
## [53] train-rmse:20.555603+1.242438 test-rmse:20.750969+1.425080
## [54] train-rmse:20.336318+1.258458 test-rmse:20.532419+1.456648
## [55] train-rmse:20.139963+1.303300 test-rmse:20.339343+1.511819
## [56] train-rmse:19.641997+1.215082 test-rmse:19.845562+1.409409
## [57] train-rmse:19.373971+1.286251 test-rmse:19.582635+1.451991
## [58] train-rmse:19.093356+1.154237 test-rmse:19.297180+1.322007
## [59] train-rmse:18.664548+1.130235 test-rmse:18.867352+1.266556
## [60] train-rmse:18.356747+1.190502 test-rmse:18.565554+1.305706
## [61] train-rmse:18.058009+1.266767 test-rmse:18.269644+1.361181
## [62] train-rmse:17.763066+1.304833 test-rmse:17.976448+1.388284
## [63] train-rmse:17.392349+1.291859 test-rmse:17.611901+1.391164
## [64] train-rmse:17.132474+1.346762 test-rmse:17.355011+1.465197
## [65] train-rmse:16.802423+1.343859 test-rmse:17.029047+1.444147
## [66] train-rmse:16.483817+1.348208 test-rmse:16.713843+1.460448
## [67] train-rmse:16.168683+1.273952 test-rmse:16.402159+1.384061
## [68] train-rmse:15.981736+1.174512 test-rmse:16.222420+1.289637
## [69] train-rmse:15.767996+1.232178 test-rmse:16.010705+1.360615
## [70] train-rmse:15.549390+1.171009 test-rmse:15.789847+1.317939
## [71] train-rmse:15.419484+1.252094 test-rmse:15.660256+1.395755
## [72] train-rmse:15.178119+1.240006 test-rmse:15.423660+1.381931
## [73] train-rmse:15.024933+1.169310 test-rmse:15.269806+1.309344
## [74] train-rmse:14.744385+1.115933 test-rmse:14.989075+1.253997
## [75] train-rmse:14.554456+1.027026 test-rmse:14.796697+1.160290
## [76] train-rmse:14.468998+0.980994 test-rmse:14.715988+1.101812
## [77] train-rmse:14.302921+0.954728 test-rmse:14.550246+1.054157
## [78] train-rmse:14.207751+1.011058 test-rmse:14.459476+1.110396
## [79] train-rmse:13.993363+0.921596 test-rmse:14.249039+1.026188
## [80] train-rmse:13.885177+0.908088 test-rmse:14.142837+1.002259
## [81] train-rmse:13.707482+0.907199 test-rmse:13.972268+0.988394
## [82] train-rmse:13.573616+0.871751 test-rmse:13.841192+0.960323
## [83] train-rmse:13.401598+0.826557 test-rmse:13.671468+0.913425
## [84] train-rmse:13.238998+0.787131 test-rmse:13.511245+0.867230
## [85] train-rmse:13.053808+0.756525 test-rmse:13.330726+0.834952
## [86] train-rmse:12.927942+0.688835 test-rmse:13.208439+0.770070
## [87] train-rmse:12.883840+0.725243 test-rmse:13.166493+0.801005
## [88] train-rmse:12.783741+0.731483 test-rmse:13.073882+0.800239
## [89] train-rmse:12.724299+0.718740 test-rmse:13.015344+0.796116
## [90] train-rmse:12.636796+0.739325 test-rmse:12.928852+0.804918
## [91] train-rmse:12.544169+0.724980 test-rmse:12.832758+0.789029
## [92] train-rmse:12.400237+0.694019 test-rmse:12.688413+0.760329
## [93] train-rmse:12.299069+0.640253 test-rmse:12.585575+0.703267
## [94] train-rmse:12.208106+0.598368 test-rmse:12.493493+0.660656
## [95] train-rmse:12.079379+0.579635 test-rmse:12.366643+0.642090
## [96] train-rmse:11.963787+0.560759 test-rmse:12.250263+0.622924
## [97] train-rmse:11.893249+0.573860 test-rmse:12.180747+0.645342
## [98] train-rmse:11.862918+0.576236 test-rmse:12.148935+0.653339
## [99] train-rmse:11.783112+0.588486 test-rmse:12.069667+0.659628
## [100] train-rmse:11.695365+0.573048 test-rmse:11.981367+0.654339
## [101] train-rmse:11.635983+0.593495 test-rmse:11.922046+0.680130
## [102] train-rmse:11.579567+0.611677 test-rmse:11.866863+0.703926
## [103] train-rmse:11.484293+0.586886 test-rmse:11.772220+0.679067
## [104] train-rmse:11.420143+0.565122 test-rmse:11.710143+0.655879
## [105] train-rmse:11.365886+0.571852 test-rmse:11.659728+0.659930
## [106] train-rmse:11.295451+0.550707 test-rmse:11.589531+0.637279
## [107] train-rmse:11.244492+0.523744 test-rmse:11.540145+0.619710
## [108] train-rmse:11.197286+0.536051 test-rmse:11.497243+0.626526
## [109] train-rmse:11.140789+0.510071 test-rmse:11.443049+0.600816
## [110] train-rmse:11.089424+0.501646 test-rmse:11.390910+0.596461
## [111] train-rmse:11.029752+0.489832 test-rmse:11.332292+0.581800
## [112] train-rmse:10.978429+0.466366 test-rmse:11.281589+0.571699
## [113] train-rmse:10.960257+0.476238 test-rmse:11.263753+0.580611
## [114] train-rmse:10.916419+0.478981 test-rmse:11.221794+0.575503
## [115] train-rmse:10.869904+0.454041 test-rmse:11.175240+0.549511
## [116] train-rmse:10.817713+0.442914 test-rmse:11.124262+0.535468
## [117] train-rmse:10.758573+0.430308 test-rmse:11.065993+0.526622
## [118] train-rmse:10.708887+0.422436 test-rmse:11.016220+0.529604
## [119] train-rmse:10.656801+0.396630 test-rmse:10.964880+0.509991
## [120] train-rmse:10.604107+0.387217 test-rmse:10.909600+0.499444
## [121] train-rmse:10.571951+0.387818 test-rmse:10.878389+0.501600
## [122] train-rmse:10.548940+0.395535 test-rmse:10.857190+0.509593
## [123] train-rmse:10.508228+0.378576 test-rmse:10.818514+0.497348
## [124] train-rmse:10.461029+0.369023 test-rmse:10.773970+0.488230
## [125] train-rmse:10.438132+0.366194 test-rmse:10.752078+0.491521
## [126] train-rmse:10.405665+0.362311 test-rmse:10.720880+0.481806
## [127] train-rmse:10.372919+0.347835 test-rmse:10.687829+0.468087
## [128] train-rmse:10.338219+0.343580 test-rmse:10.654994+0.471222
## [129] train-rmse:10.302982+0.330495 test-rmse:10.618090+0.459364
## [130] train-rmse:10.268150+0.318149 test-rmse:10.582318+0.452690
## [131] train-rmse:10.239830+0.304422 test-rmse:10.553416+0.435535
## [132] train-rmse:10.214925+0.294318 test-rmse:10.528403+0.427561
## [133] train-rmse:10.187567+0.283916 test-rmse:10.499851+0.418485
## [134] train-rmse:10.162352+0.285777 test-rmse:10.472926+0.408416
## [135] train-rmse:10.138565+0.290017 test-rmse:10.449473+0.416144
## [136] train-rmse:10.110289+0.289661 test-rmse:10.420505+0.420958
## [137] train-rmse:10.080557+0.283655 test-rmse:10.391616+0.410724
## [138] train-rmse:10.052450+0.272125 test-rmse:10.364044+0.402295
## [139] train-rmse:10.030154+0.268143 test-rmse:10.343941+0.395366
## [140] train-rmse:10.003584+0.270149 test-rmse:10.314946+0.389151
## [141] train-rmse:9.981108+0.272146 test-rmse:10.292370+0.392469
## [142] train-rmse:9.968613+0.270443 test-rmse:10.280058+0.393368
## [143] train-rmse:9.954507+0.272755 test-rmse:10.266737+0.395634
## [144] train-rmse:9.945733+0.272524 test-rmse:10.257521+0.394620
## [145] train-rmse:9.928398+0.268431 test-rmse:10.240430+0.386672
## [146] train-rmse:9.909482+0.269592 test-rmse:10.221476+0.390443
## [147] train-rmse:9.888770+0.273547 test-rmse:10.200798+0.392274
## [148] train-rmse:9.868002+0.272527 test-rmse:10.180335+0.393892
## [149] train-rmse:9.854154+0.274366 test-rmse:10.166357+0.394305
## [150] train-rmse:9.840402+0.269503 test-rmse:10.150983+0.391375
## [151] train-rmse:9.817040+0.269112 test-rmse:10.127662+0.394175
## [152] train-rmse:9.804457+0.270549 test-rmse:10.115482+0.395990
## [153] train-rmse:9.791049+0.267749 test-rmse:10.103111+0.396191
## [154] train-rmse:9.768323+0.259712 test-rmse:10.079521+0.386205
## [155] train-rmse:9.743681+0.257151 test-rmse:10.055079+0.384576
## [156] train-rmse:9.726397+0.256747 test-rmse:10.038817+0.382729
## [157] train-rmse:9.711468+0.260261 test-rmse:10.023997+0.388367
## [158] train-rmse:9.696066+0.262473 test-rmse:10.010005+0.388253
## [159] train-rmse:9.678501+0.258117 test-rmse:9.990132+0.380896
## [160] train-rmse:9.667259+0.258058 test-rmse:9.979350+0.380847
## [161] train-rmse:9.650305+0.255550 test-rmse:9.962040+0.374532
## [162] train-rmse:9.643441+0.254026 test-rmse:9.955491+0.371921
## [163] train-rmse:9.624506+0.254014 test-rmse:9.938577+0.373106
## [164] train-rmse:9.610376+0.250247 test-rmse:9.924525+0.372352
## [165] train-rmse:9.597436+0.249916 test-rmse:9.911904+0.372420
## [166] train-rmse:9.583782+0.246253 test-rmse:9.899458+0.369973
## [167] train-rmse:9.568854+0.242033 test-rmse:9.885391+0.370372
## [168] train-rmse:9.559104+0.239018 test-rmse:9.874243+0.368307
## [169] train-rmse:9.538059+0.234444 test-rmse:9.853492+0.366974
## [170] train-rmse:9.522909+0.233828 test-rmse:9.839547+0.370557
## [171] train-rmse:9.508664+0.230178 test-rmse:9.824994+0.362440
## [172] train-rmse:9.498856+0.229309 test-rmse:9.816595+0.362519
## [173] train-rmse:9.490717+0.228929 test-rmse:9.808746+0.362582
## [174] train-rmse:9.479309+0.232886 test-rmse:9.797744+0.366530
## [175] train-rmse:9.460476+0.227874 test-rmse:9.780552+0.363627
## [176] train-rmse:9.443375+0.223134 test-rmse:9.763211+0.359304
## [177] train-rmse:9.431810+0.225680 test-rmse:9.752993+0.361286
## [178] train-rmse:9.422489+0.224588 test-rmse:9.744180+0.360350
## [179] train-rmse:9.408737+0.219965 test-rmse:9.731364+0.354629
## [180] train-rmse:9.398624+0.221871 test-rmse:9.720717+0.357535
## [181] train-rmse:9.388951+0.221443 test-rmse:9.710355+0.352998
## [182] train-rmse:9.376351+0.217778 test-rmse:9.696782+0.350414
## [183] train-rmse:9.368590+0.217224 test-rmse:9.689716+0.351356
## [184] train-rmse:9.352060+0.213476 test-rmse:9.670973+0.342864
## [185] train-rmse:9.341866+0.215791 test-rmse:9.661568+0.344894
## [186] train-rmse:9.331442+0.212573 test-rmse:9.651487+0.344799
## [187] train-rmse:9.319191+0.209051 test-rmse:9.638959+0.342304
## [188] train-rmse:9.307088+0.207991 test-rmse:9.626953+0.343549
## [189] train-rmse:9.298179+0.206607 test-rmse:9.618282+0.345937
## [190] train-rmse:9.289605+0.207347 test-rmse:9.610133+0.347884
## [191] train-rmse:9.281917+0.205572 test-rmse:9.603140+0.347166
## [192] train-rmse:9.273835+0.205125 test-rmse:9.595968+0.346556
## [193] train-rmse:9.264224+0.204798 test-rmse:9.588140+0.345883
## [194] train-rmse:9.253228+0.205418 test-rmse:9.577372+0.348348
## [195] train-rmse:9.241636+0.205678 test-rmse:9.567641+0.346881
## [196] train-rmse:9.229632+0.202709 test-rmse:9.556858+0.348469
## [197] train-rmse:9.221069+0.200158 test-rmse:9.548216+0.346199
## [198] train-rmse:9.207564+0.194758 test-rmse:9.535643+0.341696
## [199] train-rmse:9.196649+0.194726 test-rmse:9.524671+0.337533
## [200] train-rmse:9.191955+0.193856 test-rmse:9.520129+0.337843
## [201] train-rmse:9.179259+0.189434 test-rmse:9.507863+0.334176
## [202] train-rmse:9.169101+0.187974 test-rmse:9.497131+0.329239
## [203] train-rmse:9.153462+0.186197 test-rmse:9.481736+0.328483
## [204] train-rmse:9.144964+0.186667 test-rmse:9.473858+0.328080
## [205] train-rmse:9.138452+0.187495 test-rmse:9.468884+0.330553
## [206] train-rmse:9.132580+0.187066 test-rmse:9.463667+0.330729
## [207] train-rmse:9.125553+0.184862 test-rmse:9.456194+0.323541
## [208] train-rmse:9.119051+0.185645 test-rmse:9.449886+0.326456
## [209] train-rmse:9.111306+0.185289 test-rmse:9.442230+0.326988
## [210] train-rmse:9.102033+0.183870 test-rmse:9.433643+0.325617
## [211] train-rmse:9.096477+0.183626 test-rmse:9.428649+0.325943
## [212] train-rmse:9.085712+0.184233 test-rmse:9.419220+0.326119
## [213] train-rmse:9.079890+0.182994 test-rmse:9.413699+0.323949
## [214] train-rmse:9.071769+0.181029 test-rmse:9.406076+0.321504
## [215] train-rmse:9.061844+0.179566 test-rmse:9.396172+0.319577
## [216] train-rmse:9.050595+0.177879 test-rmse:9.385558+0.317910
## [217] train-rmse:9.040974+0.175041 test-rmse:9.377395+0.319038
## [218] train-rmse:9.034094+0.173564 test-rmse:9.370442+0.318016
## [219] train-rmse:9.025810+0.170800 test-rmse:9.362142+0.314908
## [220] train-rmse:9.020363+0.169165 test-rmse:9.356656+0.313846
## [221] train-rmse:9.012264+0.166060 test-rmse:9.349887+0.314868
## [222] train-rmse:9.005342+0.165913 test-rmse:9.343455+0.313804
## [223] train-rmse:8.998773+0.165329 test-rmse:9.336193+0.315482
## [224] train-rmse:8.990972+0.163964 test-rmse:9.329155+0.313307
## [225] train-rmse:8.985557+0.163477 test-rmse:9.324181+0.314096
## [226] train-rmse:8.979931+0.165863 test-rmse:9.318450+0.316042
## [227] train-rmse:8.969303+0.163417 test-rmse:9.307984+0.315869
## [228] train-rmse:8.961827+0.162184 test-rmse:9.300960+0.316621
## [229] train-rmse:8.954631+0.160719 test-rmse:9.294833+0.315701
## [230] train-rmse:8.947914+0.159064 test-rmse:9.287804+0.310467
## [231] train-rmse:8.941145+0.159324 test-rmse:9.282284+0.307436
## [232] train-rmse:8.932537+0.156024 test-rmse:9.273848+0.305573
## [233] train-rmse:8.925758+0.155045 test-rmse:9.268126+0.305893
## [234] train-rmse:8.919624+0.153138 test-rmse:9.262503+0.306788
## [235] train-rmse:8.912716+0.150354 test-rmse:9.254581+0.302972
## [236] train-rmse:8.907617+0.151331 test-rmse:9.249833+0.302216
## [237] train-rmse:8.902585+0.150090 test-rmse:9.244779+0.301269
## [238] train-rmse:8.897156+0.148726 test-rmse:9.239674+0.300181
## [239] train-rmse:8.890984+0.149326 test-rmse:9.233980+0.301389
## [240] train-rmse:8.884421+0.147570 test-rmse:9.227653+0.302575
## [241] train-rmse:8.880146+0.147908 test-rmse:9.223793+0.303488
## [242] train-rmse:8.873267+0.145659 test-rmse:9.216839+0.304004
## [243] train-rmse:8.867021+0.144497 test-rmse:9.210371+0.299925
## [244] train-rmse:8.860797+0.142364 test-rmse:9.205169+0.298167
## [245] train-rmse:8.854486+0.143742 test-rmse:9.198919+0.299736
## [246] train-rmse:8.846043+0.144562 test-rmse:9.191826+0.300122
## [247] train-rmse:8.838592+0.144308 test-rmse:9.185778+0.299641
## [248] train-rmse:8.829448+0.140770 test-rmse:9.176742+0.296749
## [249] train-rmse:8.825030+0.140110 test-rmse:9.172775+0.296628
## [250] train-rmse:8.819097+0.140169 test-rmse:9.167867+0.297909
## [251] train-rmse:8.813894+0.138937 test-rmse:9.163669+0.297420
## [252] train-rmse:8.809403+0.137574 test-rmse:9.159674+0.297060
## [253] train-rmse:8.803683+0.135299 test-rmse:9.154742+0.298260
## [254] train-rmse:8.798818+0.134952 test-rmse:9.150083+0.297477
## [255] train-rmse:8.793535+0.134317 test-rmse:9.144977+0.294710
## [256] train-rmse:8.787952+0.131667 test-rmse:9.139872+0.293806
## [257] train-rmse:8.781135+0.130991 test-rmse:9.133284+0.295634
## [258] train-rmse:8.777143+0.129929 test-rmse:9.129867+0.295065
## [259] train-rmse:8.769674+0.130471 test-rmse:9.123098+0.297029
## [260] train-rmse:8.763330+0.130407 test-rmse:9.116956+0.297072
## [261] train-rmse:8.756798+0.129322 test-rmse:9.109881+0.294106
## [262] train-rmse:8.752204+0.128367 test-rmse:9.105869+0.292401
## [263] train-rmse:8.745731+0.127203 test-rmse:9.099680+0.290743
## [264] train-rmse:8.739232+0.127818 test-rmse:9.094262+0.292436
## [265] train-rmse:8.734261+0.126795 test-rmse:9.089100+0.294061
## [266] train-rmse:8.729095+0.126803 test-rmse:9.084026+0.294060
## [267] train-rmse:8.723178+0.126670 test-rmse:9.077963+0.295944
## [268] train-rmse:8.717808+0.124584 test-rmse:9.072453+0.294002
## [269] train-rmse:8.712367+0.123935 test-rmse:9.067154+0.295312
## [270] train-rmse:8.706577+0.123199 test-rmse:9.060968+0.295027
## [271] train-rmse:8.699446+0.123875 test-rmse:9.054215+0.294961
## [272] train-rmse:8.692744+0.122394 test-rmse:9.048680+0.293942
## [273] train-rmse:8.686876+0.122380 test-rmse:9.043430+0.293734
## [274] train-rmse:8.681089+0.123495 test-rmse:9.037688+0.295458
## [275] train-rmse:8.676207+0.123389 test-rmse:9.032897+0.295966
## [276] train-rmse:8.672217+0.123867 test-rmse:9.029221+0.297095
## [277] train-rmse:8.666440+0.124048 test-rmse:9.023883+0.295691
## [278] train-rmse:8.662211+0.123530 test-rmse:9.019612+0.295833
## [279] train-rmse:8.657963+0.122558 test-rmse:9.015586+0.296335
## [280] train-rmse:8.651971+0.120592 test-rmse:9.010602+0.295805
## [281] train-rmse:8.648824+0.119552 test-rmse:9.008077+0.295589
## [282] train-rmse:8.640981+0.118904 test-rmse:9.000690+0.291440
## [283] train-rmse:8.636444+0.119869 test-rmse:8.996280+0.290680
## [284] train-rmse:8.632246+0.121019 test-rmse:8.991760+0.290731
## [285] train-rmse:8.626691+0.121583 test-rmse:8.986453+0.288197
## [286] train-rmse:8.623153+0.121773 test-rmse:8.983662+0.289596
## [287] train-rmse:8.618866+0.120909 test-rmse:8.979897+0.290130
## [288] train-rmse:8.614476+0.121727 test-rmse:8.975920+0.289932
## [289] train-rmse:8.609075+0.124274 test-rmse:8.970103+0.290498
## [290] train-rmse:8.603500+0.122092 test-rmse:8.965922+0.292216
## [291] train-rmse:8.598647+0.120814 test-rmse:8.960602+0.288102
## [292] train-rmse:8.592922+0.119429 test-rmse:8.954556+0.289028
## [293] train-rmse:8.587861+0.118669 test-rmse:8.949981+0.287435
## [294] train-rmse:8.582219+0.118403 test-rmse:8.944576+0.289784
## [295] train-rmse:8.578663+0.118312 test-rmse:8.941169+0.288191
## [296] train-rmse:8.573927+0.117161 test-rmse:8.937105+0.288778
## [297] train-rmse:8.569051+0.115084 test-rmse:8.932944+0.289161
## [298] train-rmse:8.564935+0.116027 test-rmse:8.929085+0.290867
## [299] train-rmse:8.559643+0.114148 test-rmse:8.924736+0.290040
## [300] train-rmse:8.557081+0.114107 test-rmse:8.922411+0.289541
## [301] train-rmse:8.552046+0.113961 test-rmse:8.917877+0.289033
## [302] train-rmse:8.546947+0.113548 test-rmse:8.913953+0.289977
## [303] train-rmse:8.542004+0.113007 test-rmse:8.909105+0.290286
## [304] train-rmse:8.534340+0.113259 test-rmse:8.902345+0.289642
## [305] train-rmse:8.530323+0.113360 test-rmse:8.898568+0.291441
## [306] train-rmse:8.524599+0.112977 test-rmse:8.893919+0.290491
## [307] train-rmse:8.519382+0.112010 test-rmse:8.889378+0.288221
## [308] train-rmse:8.515482+0.111810 test-rmse:8.886067+0.288743
## [309] train-rmse:8.511361+0.113562 test-rmse:8.882429+0.289770
## [310] train-rmse:8.507339+0.111798 test-rmse:8.878886+0.289247
## [311] train-rmse:8.504201+0.112198 test-rmse:8.877127+0.291976
## [312] train-rmse:8.500366+0.112488 test-rmse:8.873144+0.290968
## [313] train-rmse:8.495998+0.111841 test-rmse:8.869097+0.291137
## [314] train-rmse:8.491633+0.112011 test-rmse:8.865526+0.290485
## [315] train-rmse:8.488577+0.112059 test-rmse:8.862831+0.289537
## [316] train-rmse:8.484472+0.113356 test-rmse:8.858373+0.288132
## [317] train-rmse:8.481425+0.113585 test-rmse:8.855268+0.287982
## [318] train-rmse:8.476089+0.113091 test-rmse:8.849023+0.284105
## [319] train-rmse:8.471962+0.114105 test-rmse:8.845141+0.284487
## [320] train-rmse:8.466069+0.113788 test-rmse:8.839102+0.283357
## [321] train-rmse:8.461702+0.114239 test-rmse:8.835062+0.285226
## [322] train-rmse:8.458019+0.116074 test-rmse:8.831525+0.286963
## [323] train-rmse:8.454229+0.115193 test-rmse:8.828337+0.286917
## [324] train-rmse:8.450663+0.115165 test-rmse:8.826303+0.288908
## [325] train-rmse:8.445919+0.114698 test-rmse:8.822217+0.286673
## [326] train-rmse:8.442293+0.114011 test-rmse:8.818598+0.286938
## [327] train-rmse:8.438979+0.113680 test-rmse:8.815336+0.287565
## [328] train-rmse:8.434216+0.112974 test-rmse:8.811775+0.287072
## [329] train-rmse:8.429879+0.113630 test-rmse:8.807816+0.285661
## [330] train-rmse:8.425727+0.113823 test-rmse:8.804281+0.286277
## [331] train-rmse:8.421887+0.113541 test-rmse:8.800897+0.284239
## [332] train-rmse:8.417911+0.114209 test-rmse:8.798651+0.283248
## [333] train-rmse:8.413847+0.113526 test-rmse:8.794633+0.284324
## [334] train-rmse:8.411417+0.113605 test-rmse:8.792217+0.284816
## [335] train-rmse:8.407450+0.114286 test-rmse:8.788558+0.283517
## [336] train-rmse:8.403794+0.114789 test-rmse:8.785336+0.284982
## [337] train-rmse:8.398787+0.117646 test-rmse:8.780856+0.284771
## [338] train-rmse:8.396425+0.117797 test-rmse:8.779326+0.285043
## [339] train-rmse:8.392445+0.117506 test-rmse:8.775160+0.280358
## [340] train-rmse:8.386736+0.118151 test-rmse:8.770392+0.280905
## [341] train-rmse:8.383048+0.118023 test-rmse:8.767437+0.279058
## [342] train-rmse:8.379728+0.117573 test-rmse:8.765095+0.279662
## [343] train-rmse:8.375833+0.117491 test-rmse:8.762383+0.280367
## [344] train-rmse:8.372060+0.116695 test-rmse:8.758993+0.280416
## [345] train-rmse:8.366936+0.116869 test-rmse:8.754642+0.281481
## [346] train-rmse:8.361933+0.115284 test-rmse:8.750303+0.279365
## [347] train-rmse:8.357452+0.116801 test-rmse:8.745061+0.280778
## [348] train-rmse:8.354712+0.115382 test-rmse:8.742765+0.280844
## [349] train-rmse:8.352015+0.116136 test-rmse:8.739906+0.282051
## [350] train-rmse:8.346868+0.114409 test-rmse:8.736365+0.281574
## [351] train-rmse:8.342710+0.113397 test-rmse:8.732858+0.281329
## [352] train-rmse:8.337940+0.111874 test-rmse:8.729558+0.282076
## [353] train-rmse:8.335345+0.111796 test-rmse:8.727525+0.281774
## [354] train-rmse:8.331049+0.113244 test-rmse:8.723280+0.280515
## [355] train-rmse:8.326952+0.112691 test-rmse:8.719346+0.279671
## [356] train-rmse:8.323652+0.111643 test-rmse:8.716547+0.278764
## [357] train-rmse:8.319223+0.112060 test-rmse:8.712922+0.276219
## [358] train-rmse:8.315540+0.112462 test-rmse:8.709384+0.276912
## [359] train-rmse:8.311421+0.111450 test-rmse:8.705603+0.276602
## [360] train-rmse:8.308662+0.110867 test-rmse:8.703212+0.276282
## [361] train-rmse:8.306142+0.109864 test-rmse:8.701069+0.276283
## [362] train-rmse:8.302051+0.110345 test-rmse:8.697326+0.275360
## [363] train-rmse:8.299490+0.110271 test-rmse:8.695999+0.275246
## [364] train-rmse:8.294761+0.109359 test-rmse:8.692178+0.273835
## [365] train-rmse:8.289644+0.110095 test-rmse:8.687572+0.273107
## [366] train-rmse:8.286680+0.109957 test-rmse:8.684767+0.272776
## [367] train-rmse:8.282881+0.111044 test-rmse:8.681073+0.272865
## [368] train-rmse:8.279471+0.111701 test-rmse:8.678821+0.272302
## [369] train-rmse:8.276706+0.110443 test-rmse:8.676928+0.272265
## [370] train-rmse:8.272687+0.109785 test-rmse:8.673238+0.271694
## [371] train-rmse:8.269869+0.108466 test-rmse:8.671549+0.272235
## [372] train-rmse:8.267546+0.108246 test-rmse:8.669678+0.272108
## [373] train-rmse:8.262809+0.108325 test-rmse:8.665331+0.273371
## [374] train-rmse:8.260179+0.108811 test-rmse:8.663379+0.273604
## [375] train-rmse:8.256976+0.108769 test-rmse:8.660668+0.273304
## [376] train-rmse:8.253582+0.108666 test-rmse:8.657538+0.273504
## [377] train-rmse:8.250138+0.107979 test-rmse:8.654788+0.275059
## [378] train-rmse:8.246836+0.108530 test-rmse:8.651400+0.274510
## [379] train-rmse:8.244481+0.108627 test-rmse:8.649024+0.275649
## [380] train-rmse:8.239697+0.111188 test-rmse:8.645103+0.274424
## [381] train-rmse:8.237070+0.110921 test-rmse:8.642551+0.273709
## [382] train-rmse:8.233694+0.110977 test-rmse:8.640528+0.273643
## [383] train-rmse:8.229069+0.108992 test-rmse:8.635601+0.274816
## [384] train-rmse:8.224621+0.108151 test-rmse:8.631816+0.275402
## [385] train-rmse:8.220769+0.108071 test-rmse:8.628644+0.276042
## [386] train-rmse:8.216255+0.106751 test-rmse:8.624903+0.276680
## [387] train-rmse:8.211975+0.106718 test-rmse:8.620802+0.275735
## [388] train-rmse:8.207489+0.106179 test-rmse:8.616847+0.274495
## [389] train-rmse:8.202949+0.105573 test-rmse:8.612800+0.273785
## [390] train-rmse:8.199502+0.106011 test-rmse:8.609369+0.271596
## [391] train-rmse:8.194150+0.104656 test-rmse:8.604937+0.271292
## [392] train-rmse:8.191524+0.104844 test-rmse:8.602938+0.271049
## [393] train-rmse:8.186738+0.104256 test-rmse:8.598852+0.271595
## [394] train-rmse:8.182681+0.102268 test-rmse:8.595140+0.272974
## [395] train-rmse:8.180128+0.101560 test-rmse:8.592829+0.273328
## [396] train-rmse:8.175660+0.101261 test-rmse:8.589334+0.271899
## [397] train-rmse:8.174261+0.101533 test-rmse:8.587933+0.271778
## [398] train-rmse:8.171317+0.100060 test-rmse:8.585193+0.271646
## [399] train-rmse:8.168709+0.099048 test-rmse:8.583225+0.271773
## [400] train-rmse:8.166076+0.097906 test-rmse:8.581148+0.272330
## [401] train-rmse:8.163575+0.098403 test-rmse:8.578954+0.272782
## [402] train-rmse:8.158012+0.097720 test-rmse:8.574222+0.274629
## [403] train-rmse:8.154116+0.096133 test-rmse:8.570826+0.274974
## [404] train-rmse:8.150174+0.095481 test-rmse:8.567481+0.276702
## [405] train-rmse:8.147309+0.095743 test-rmse:8.564858+0.275559
## [406] train-rmse:8.144409+0.094807 test-rmse:8.562187+0.274224
## [407] train-rmse:8.140903+0.094755 test-rmse:8.559598+0.273337
## [408] train-rmse:8.137098+0.094581 test-rmse:8.556940+0.273858
## [409] train-rmse:8.134405+0.094077 test-rmse:8.554519+0.273619
## [410] train-rmse:8.132254+0.092561 test-rmse:8.552679+0.274133
## [411] train-rmse:8.129830+0.093374 test-rmse:8.550844+0.274591
## [412] train-rmse:8.128153+0.093541 test-rmse:8.549533+0.275026
## [413] train-rmse:8.124656+0.092838 test-rmse:8.546348+0.274941
## [414] train-rmse:8.121510+0.093191 test-rmse:8.543754+0.273746
## [415] train-rmse:8.117702+0.092869 test-rmse:8.540696+0.274347
## [416] train-rmse:8.115480+0.092273 test-rmse:8.539314+0.274346
## [417] train-rmse:8.112507+0.091522 test-rmse:8.536267+0.275698
## [418] train-rmse:8.110397+0.091746 test-rmse:8.534393+0.276187
## [419] train-rmse:8.107155+0.090992 test-rmse:8.532109+0.275996
## [420] train-rmse:8.104655+0.090915 test-rmse:8.530622+0.275597
## [421] train-rmse:8.101663+0.091469 test-rmse:8.527582+0.274279
## [422] train-rmse:8.098822+0.090770 test-rmse:8.525194+0.274080
## [423] train-rmse:8.095387+0.090274 test-rmse:8.523229+0.274242
## [424] train-rmse:8.092198+0.090249 test-rmse:8.519850+0.275152
## [425] train-rmse:8.089833+0.089624 test-rmse:8.517859+0.275580
## [426] train-rmse:8.085585+0.089732 test-rmse:8.515005+0.276292
## [427] train-rmse:8.082400+0.089329 test-rmse:8.512342+0.276278
## [428] train-rmse:8.078796+0.089875 test-rmse:8.509256+0.276792
## [429] train-rmse:8.075849+0.089464 test-rmse:8.506932+0.277166
## [430] train-rmse:8.073041+0.087549 test-rmse:8.504521+0.277748
## [431] train-rmse:8.071242+0.087549 test-rmse:8.502811+0.277996
## [432] train-rmse:8.069029+0.086865 test-rmse:8.501328+0.279035
## [433] train-rmse:8.064213+0.087649 test-rmse:8.498113+0.279256
## [434] train-rmse:8.062222+0.087349 test-rmse:8.496344+0.279523
## [435] train-rmse:8.059366+0.086985 test-rmse:8.494391+0.280362
## [436] train-rmse:8.055438+0.086068 test-rmse:8.491416+0.279421
## [437] train-rmse:8.052108+0.086885 test-rmse:8.488386+0.278579
## [438] train-rmse:8.048850+0.085997 test-rmse:8.486083+0.278815
## [439] train-rmse:8.046628+0.086346 test-rmse:8.483849+0.277993
## [440] train-rmse:8.042889+0.086005 test-rmse:8.480684+0.277843
## [441] train-rmse:8.040497+0.086143 test-rmse:8.478938+0.277881
## [442] train-rmse:8.037734+0.085031 test-rmse:8.476723+0.277461
## [443] train-rmse:8.034520+0.084576 test-rmse:8.473696+0.276319
## [444] train-rmse:8.032463+0.084402 test-rmse:8.471935+0.275350
## [445] train-rmse:8.029952+0.084621 test-rmse:8.469428+0.273860
## [446] train-rmse:8.026661+0.085226 test-rmse:8.466218+0.274976
## [447] train-rmse:8.023922+0.084412 test-rmse:8.464393+0.275190
## [448] train-rmse:8.020094+0.082567 test-rmse:8.461365+0.274554
## [449] train-rmse:8.017666+0.082422 test-rmse:8.459771+0.273872
## [450] train-rmse:8.015571+0.081732 test-rmse:8.458643+0.272889
## [451] train-rmse:8.013494+0.081625 test-rmse:8.456833+0.271866
## [452] train-rmse:8.011926+0.081374 test-rmse:8.455317+0.271847
## [453] train-rmse:8.010041+0.081411 test-rmse:8.453983+0.272148
## [454] train-rmse:8.007793+0.081161 test-rmse:8.452064+0.271696
## [455] train-rmse:8.005199+0.080528 test-rmse:8.450239+0.272341
## [456] train-rmse:8.002216+0.080523 test-rmse:8.448045+0.272736
## [457] train-rmse:7.999707+0.079783 test-rmse:8.445560+0.273918
## [458] train-rmse:7.997831+0.079700 test-rmse:8.444458+0.274007
## [459] train-rmse:7.994603+0.080771 test-rmse:8.440866+0.273367
## [460] train-rmse:7.992879+0.080432 test-rmse:8.439574+0.273518
## [461] train-rmse:7.990083+0.080111 test-rmse:8.436619+0.273167
## [462] train-rmse:7.987834+0.080243 test-rmse:8.434950+0.272523
## [463] train-rmse:7.984344+0.079891 test-rmse:8.431353+0.273276
## [464] train-rmse:7.981525+0.079210 test-rmse:8.428841+0.273198
## [465] train-rmse:7.978375+0.080101 test-rmse:8.427250+0.273470
## [466] train-rmse:7.974630+0.080433 test-rmse:8.424097+0.274155
## [467] train-rmse:7.973415+0.080284 test-rmse:8.423350+0.274252
## [468] train-rmse:7.971295+0.080645 test-rmse:8.421641+0.275041
## [469] train-rmse:7.968706+0.080043 test-rmse:8.419375+0.275431
## [470] train-rmse:7.966567+0.079792 test-rmse:8.417928+0.275207
## [471] train-rmse:7.962298+0.078212 test-rmse:8.416155+0.275486
## [472] train-rmse:7.959742+0.078490 test-rmse:8.413780+0.276457
## [473] train-rmse:7.956085+0.080289 test-rmse:8.410702+0.275732
## [474] train-rmse:7.952344+0.080014 test-rmse:8.409303+0.275206
## [475] train-rmse:7.949541+0.079862 test-rmse:8.406817+0.275842
## [476] train-rmse:7.946075+0.078937 test-rmse:8.404160+0.276621
## [477] train-rmse:7.944195+0.079417 test-rmse:8.402513+0.275579
## [478] train-rmse:7.941987+0.079256 test-rmse:8.400459+0.274787
## [479] train-rmse:7.938580+0.080687 test-rmse:8.397437+0.274617
## [480] train-rmse:7.935722+0.082294 test-rmse:8.395603+0.274863
## [481] train-rmse:7.932792+0.081274 test-rmse:8.392947+0.274775
## [482] train-rmse:7.929390+0.080549 test-rmse:8.389311+0.271597
## [483] train-rmse:7.926961+0.081726 test-rmse:8.387283+0.271725
## [484] train-rmse:7.925202+0.081580 test-rmse:8.384492+0.272148
## [485] train-rmse:7.922907+0.081142 test-rmse:8.382499+0.272672
## [486] train-rmse:7.919974+0.080485 test-rmse:8.380681+0.272544
## [487] train-rmse:7.917350+0.079643 test-rmse:8.379838+0.272426
## [488] train-rmse:7.915029+0.080543 test-rmse:8.377723+0.272524
## [489] train-rmse:7.912765+0.080303 test-rmse:8.375401+0.272688
## [490] train-rmse:7.909595+0.079122 test-rmse:8.373326+0.269522
## [491] train-rmse:7.905735+0.077850 test-rmse:8.370394+0.268919
## [492] train-rmse:7.902571+0.078012 test-rmse:8.369038+0.268340
## [493] train-rmse:7.899653+0.078557 test-rmse:8.366764+0.268550
## [494] train-rmse:7.896992+0.079259 test-rmse:8.365347+0.269079
## [495] train-rmse:7.894135+0.079491 test-rmse:8.363141+0.268657
## [496] train-rmse:7.890822+0.078929 test-rmse:8.361437+0.268940
## [497] train-rmse:7.888291+0.079203 test-rmse:8.359362+0.268691
## [498] train-rmse:7.885632+0.079574 test-rmse:8.357145+0.268666
## [499] train-rmse:7.883213+0.079013 test-rmse:8.355656+0.269265
## [500] train-rmse:7.880386+0.077602 test-rmse:8.353502+0.268309
best_nrounds <- cv_results$best_iteration
# Train the final model using the best number of rounds found
model_xgb <- xgb.train(
params = params,
data = dtrain,
nrounds = best_nrounds
)
# Make predictions and evaluate the model
train_pred <- predict(model_xgb, dtrain)
test_pred <- predict(model_xgb, dtest)
train_rmse <- sqrt(mean((train_labels - train_pred)^2))
test_rmse <- sqrt(mean((test_labels - test_pred)^2))
# Calculate R-squared for the training set
sst_train <- sum((train_labels - mean(train_labels)) ^ 2)
ssr_train <- sum((train_labels - train_pred) ^ 2)
r_squared_train <- 1 - (ssr_train / sst_train)
# Calculate R-squared for the test set
sst_test <- sum((test_labels - mean(test_labels)) ^ 2)
ssr_test <- sum((test_labels - test_pred) ^ 2)
r_squared_test <- 1 - (ssr_test / sst_test)
train_mape <- mean(abs((train_labels - train_pred) / train_labels)) * 100
test_mape <- mean(abs((test_labels - test_pred) / test_labels)) * 100
train_mae <- mean(abs(train_labels - train_pred))
test_mae <- mean(abs(test_labels - test_pred))
cat("Model Performance Metrics:\n",
"--------------------------\n",
"Training RMSE: ", train_rmse, "\n",
"Test RMSE: ", test_rmse, "\n",
"Training R-squared: ", r_squared_train, "\n",
"Test R-squared: ", r_squared_test, "\n",
"Training MAE: ", train_mae, "\n",
"Test MAE: ", test_mae, "\n",
"Training MAPE: ", train_mape, "%\n",
"Test MAPE: ", test_mape, "%\n", sep="")
## Model Performance Metrics:
## --------------------------
## Training RMSE: 7.930191
## Test RMSE: 8.011783
## Training R-squared: 0.9902172
## Test R-squared: 0.9896139
## Training MAE: 4.898334
## Test MAE: 4.971412
## Training MAPE: 16.84768%
## Test MAPE: 17.11487%
# Correcting Residuals Data Frame
# Assuming 'train_labels' and 'test_labels' contain the actual values,
# and 'train_pred' and 'test_pred' contain your model's predictions:
residuals_train <- train_labels - train_pred
residuals_test <- test_labels - test_pred
residuals_data <- data.frame(
Residuals = c(residuals_train, residuals_test),
Dataset = c(rep('Training', length(residuals_train)), rep('Test', length(residuals_test)))
)
# Now plotting residuals with corrected data
ggplot(residuals_data, aes(x = Residuals, fill = Dataset)) +
geom_histogram(binwidth = 1, position = 'identity', alpha = 0.6) +
facet_wrap(~ Dataset) +
ggtitle('Residuals Distribution')
# Assuming train_labels, test_labels, train_pred, and test_pred are correctly defined
# Adjusted Actual vs. Predicted Data Preparation
actual_pred_data <- data.frame(
Actual = c(train_labels, test_labels),
Predicted = c(train_pred, test_pred),
Dataset = c(rep('Training', length(train_labels)), rep('Test', length(test_labels)))
)
# Plotting Actual vs. Predicted Values
ggplot(actual_pred_data, aes(x = Actual, y = Predicted, colour = Dataset)) +
geom_point(alpha = 0.6) +
geom_abline(intercept = 0, slope = 1, linetype = 'dashed', color = 'red') +
xlab('Actual Values') +
ylab('Predicted Values') +
scale_colour_manual(values = c('Training' = 'blue', 'Test' = 'red')) +
ggtitle('Actual vs. Predicted Values')
library(xgboost)
# Calculate feature importance
importance_matrix <- xgb.importance(feature_names = colnames(train_features), model = model_xgb)
# View the feature importance scores
print(importance_matrix)
## Feature
## <char>
## 1: DOLLAR_SALES
## 2: RESERVE
## 3: WHITE
## 4: ITEM_MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 5: BRAND_MYTHICAL BEVERAGE
## 6: POP_SQMI
## 7: WEEKS_SINCE_LAUNCH
## 8: REGION_COLORADO
## 9: REGION_PRAIRIE
## 10: CALORIC_SEGMENT
## 11: REGION_KANSAS
## 12: REGION_NORTHERN
## 13: JUICED
## 14: SUNSET
## 15: REGION_MOUNTAIN
## 16: 16SMALL MULTI CUP
## 17: PITAYA
## 18: WEEK_OF_YEAR
## 19: BRAND_DIET MOONLIT
## 20: REGION_NOCAL
## 21: CASAVA
## 22: REGION_ARIZONA
## 23: REGION_SOCAL
## 24: ITEM_MOONLIT GENTLE DRINK CASAVA
## 25: JACK
## 26: GUAVA
## 27: ED
## 28: ITEM_SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK
## 29: RECOVERY
## 30: REGION_CALI_NEVADA
## 31: REGION_NEWMEXICO
## 32: REGION_DESERT_SW
## 33: ITEM_SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
## 34: ITEM_SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS
## 35: 2L MULTI JUG
## 36: BRAND_MOONLIT
## 37: 20SMALL MULTI JUG
## 38: BRAND_SUPER-DUPER JUICED
## 39: BRAND_SUPER-DUPER PUNCHED
## 40: 16SMALL 24ONE CUP
## 41: BRAND_SUPER-DUPER RECOVERY
## 42: ITEM_MOONLIT GENTLE DRINK SUNSET
## 43: ITEM_JUMPIN-FISH ENERGY DRINK CASAVA JACK
## Feature
## Gain Cover Frequency
## <num> <num> <num>
## 1: 6.949449e-01 2.983306e-01 0.2036831903
## 2: 9.170854e-02 5.823785e-02 0.0438936583
## 3: 5.057352e-02 2.533318e-02 0.0229853226
## 4: 4.039029e-02 1.272669e-02 0.0090002769
## 5: 2.621033e-02 4.603257e-03 0.0027693160
## 6: 2.470054e-02 1.391316e-01 0.1545278316
## 7: 1.832191e-02 1.600026e-01 0.1741899751
## 8: 1.066675e-02 1.031751e-02 0.0232622542
## 9: 9.336688e-03 8.162987e-03 0.0174466907
## 10: 5.512280e-03 1.655020e-02 0.0281085572
## 11: 5.471806e-03 5.639427e-03 0.0119080587
## 12: 3.451857e-03 2.132392e-02 0.0279700914
## 13: 2.836582e-03 5.393177e-03 0.0168928275
## 14: 2.720214e-03 1.429611e-02 0.0221545278
## 15: 2.172500e-03 1.917322e-02 0.0180005539
## 16: 1.341769e-03 3.251368e-03 0.0085848795
## 17: 1.149949e-03 1.944769e-02 0.0113541955
## 18: 1.020890e-03 7.335864e-02 0.0697867627
## 19: 9.891990e-04 3.840291e-04 0.0001384658
## 20: 9.395797e-04 5.570445e-03 0.0112157297
## 21: 8.923480e-04 6.391102e-03 0.0088618111
## 22: 8.460856e-04 9.899813e-03 0.0222929936
## 23: 6.064193e-04 9.033504e-03 0.0135696483
## 24: 5.770397e-04 1.139353e-02 0.0058155636
## 25: 4.419043e-04 2.750476e-03 0.0036001108
## 26: 3.851232e-04 2.528639e-03 0.0062309610
## 27: 3.406668e-04 9.961911e-03 0.0090002769
## 28: 2.914207e-04 1.380000e-03 0.0022154528
## 29: 2.426000e-04 2.632903e-03 0.0081694821
## 30: 2.347748e-04 2.780865e-03 0.0029077818
## 31: 2.097458e-04 1.697147e-02 0.0120465245
## 32: 1.189504e-04 1.109921e-03 0.0062309610
## 33: 9.019196e-05 1.155648e-03 0.0024923844
## 34: 7.054915e-05 7.105951e-03 0.0040155082
## 35: 6.052376e-05 1.444501e-03 0.0042924398
## 36: 3.529712e-05 3.784081e-03 0.0024923844
## 37: 3.284093e-05 3.041507e-03 0.0020769870
## 38: 2.235556e-05 4.554642e-04 0.0011077264
## 39: 1.761605e-05 2.234395e-03 0.0013846580
## 40: 1.074121e-05 1.831866e-03 0.0016615896
## 41: 7.170666e-06 2.749965e-04 0.0004153974
## 42: 5.514735e-06 5.786144e-04 0.0011077264
## 43: 3.466779e-08 2.433873e-05 0.0001384658
## Gain Cover Frequency
# Plot the feature importance
xgb.plot.importance(importance_matrix = importance_matrix)
# Compute partial dependence data for 'DOLLAR_SALES' and 'CASAVA', CALORIC_SEGMENT, and "ENERGY
# pd <- partial(model_xgb, pred.var = c("DOLLAR_SALES", "CASAVA", "CALORIC_SEGMENT", ENERGY"), train = train_features, grid.resolution = 20)
#
# # Default PDP
# pdp1 <- plotPartial(pd, plot = TRUE)
#
# # Add contour lines and use a different color palette
# rwb <- colorRampPalette(c("red", "white", "blue"))
# pdp2 <- plotPartial(pd, contour = TRUE, col.regions = rwb)
#
# # 3-D surface
# pdp3 <- plotPartial(pd, levelplot = FALSE, zlab = "Predicted Outcome", drape = TRUE, colorkey = TRUE, screen = list(z = -20, x = -60))
#
# # Combine plots into one window
# grid.arrange(pdp1, pdp2, pdp3, ncol = 3)
Model with NO DOLLAR SALES Variable
# Assuming 'df' is your complete dataframe and 'UNIT_SALES' is your target variable
df2 <- df
# Remove DOLLAR_SALES from the features
df2$DOLLAR_SALES <- NULL
# Split the updated data into training and testing sets (assuming you're using a similar approach as before)
set.seed(123)
df2_testtrn <- initial_split(df2, prop = 0.8, strata = UNIT_SALES)
Train <- training(df2_testtrn)
Test <- testing(df2_testtrn)
# Prepare features and labels for XGBoost, excluding DOLLAR_SALES
train_features <- Train[, -which(names(Train) == "UNIT_SALES")]
train_labels <- Train$UNIT_SALES
test_features <- Test[, -which(names(Test) == "UNIT_SALES")]
test_labels <- Test$UNIT_SALES
# Convert data to DMatrix format for XGBoost
dtrain <- xgb.DMatrix(data = as.matrix(train_features), label = train_labels)
dtest <- xgb.DMatrix(data = as.matrix(test_features), label = test_labels)
# Assuming 'params' and 'best_nrounds' are defined as before
# Train the final model without DOLLAR_SALES
model_xgb_no_dollar_sales <- xgb.train(
params = params,
data = dtrain,
nrounds = best_nrounds
)
# Make predictions and evaluate the model
train_pred <- predict(model_xgb_no_dollar_sales, dtrain)
test_pred <- predict(model_xgb_no_dollar_sales, dtest)
train_rmse <- sqrt(mean((train_labels - train_pred)^2))
test_rmse <- sqrt(mean((test_labels - test_pred)^2))
# Calculate R-squared for the training set
sst_train <- sum((train_labels - mean(train_labels)) ^ 2)
ssr_train <- sum((train_labels - train_pred) ^ 2)
r_squared_train <- 1 - (ssr_train / sst_train)
# Calculate R-squared for the test set
sst_test <- sum((test_labels - mean(test_labels)) ^ 2)
ssr_test <- sum((test_labels - test_pred) ^ 2)
r_squared_test <- 1 - (ssr_test / sst_test)
train_mape <- mean(abs((train_labels - train_pred) / train_labels)) * 100
test_mape <- mean(abs((test_labels - test_pred) / test_labels)) * 100
train_mae <- mean(abs(train_labels - train_pred))
test_mae <- mean(abs(test_labels - test_pred))
# Correcting Residuals Data Frame
# Assuming 'train_labels' and 'test_labels' contain the actual values,
# and 'train_pred' and 'test_pred' contain your model's predictions:
residuals_train <- train_labels - train_pred
residuals_test <- test_labels - test_pred
residuals_data <- data.frame(
Residuals = c(residuals_train, residuals_test),
Dataset = c(rep('Training', length(residuals_train)), rep('Test', length(residuals_test)))
)
# Now plotting residuals with corrected data
ggplot(residuals_data, aes(x = Residuals, fill = Dataset)) +
geom_histogram(binwidth = 1, position = 'identity', alpha = 0.6) +
facet_wrap(~ Dataset) +
ggtitle('Residuals Distribution')
# Assuming train_labels, test_labels, train_pred, and test_pred are correctly defined
# Adjusted Actual vs. Predicted Data Preparation
actual_pred_data <- data.frame(
Actual = c(train_labels, test_labels),
Predicted = c(train_pred, test_pred),
Dataset = c(rep('Training', length(train_labels)), rep('Test', length(test_labels)))
)
# Plotting Actual vs. Predicted Values
ggplot(actual_pred_data, aes(x = Actual, y = Predicted, colour = Dataset)) +
geom_point(alpha = 0.6) +
geom_abline(intercept = 0, slope = 1, linetype = 'dashed', color = 'red') +
xlab('Actual Values') +
ylab('Predicted Values') +
scale_colour_manual(values = c('Training' = 'blue', 'Test' = 'red')) +
ggtitle('Actual vs. Predicted Values')
cat("Model Performance Metrics:\n",
"--------------------------\n",
"Training RMSE: ", train_rmse, "\n",
"Test RMSE: ", test_rmse, "\n",
"Training R-squared: ", r_squared_train, "\n",
"Test R-squared: ", r_squared_test, "\n",
"Training MAE: ", train_mae, "\n",
"Test MAE: ", test_mae, "\n",
"Training MAPE: ", train_mape, "%\n",
"Test MAPE: ", test_mape, "%\n", sep="")
## Model Performance Metrics:
## --------------------------
## Training RMSE: 39.07454
## Test RMSE: 39.25329
## Training R-squared: 0.7624885
## Test R-squared: 0.7506866
## Training MAE: 26.52904
## Test MAE: 26.70213
## Training MAPE: 157.1858%
## Test MAPE: 159.6215%
# Calculate feature importance
importance_matrix2 <- xgb.importance(feature_names = colnames(train_features), model = model_xgb_no_dollar_sales)
# View the feature importance scores
print(importance_matrix2)
## Feature
## <char>
## 1: RESERVE
## 2: POP_SQMI
## 3: WHITE
## 4: WEEKS_SINCE_LAUNCH
## 5: REGION_PRAIRIE
## 6: REGION_COLORADO
## 7: CALORIC_SEGMENT
## 8: REGION_KANSAS
## 9: REGION_NORTHERN
## 10: 16SMALL MULTI CUP
## 11: REGION_MOUNTAIN
## 12: SUNSET
## 13: BRAND_MYTHICAL BEVERAGE
## 14: JUICED
## 15: CASAVA
## 16: REGION_NOCAL
## 17: ITEM_MYTHICAL BEVERAGE RESERVE ENERGY DRINK WHITE CASAVA
## 18: REGION_ARIZONA
## 19: WEEK_OF_YEAR
## 20: GUAVA
## 21: ITEM_MOONLIT GENTLE DRINK SUNSET
## 22: RECOVERY
## 23: REGION_DESERT_SW
## 24: REGION_SOCAL
## 25: ED
## 26: REGION_NEWMEXICO
## 27: 2L MULTI JUG
## 28: 16SMALL 24ONE CUP
## 29: PITAYA
## 30: REGION_CALI_NEVADA
## 31: BRAND_DIET MOONLIT
## 32: ITEM_MOONLIT GENTLE DRINK CASAVA
## 33: JACK
## 34: ITEM_SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK
## 35: ITEM_SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS
## 36: BRAND_SUPER-DUPER RECOVERY
## 37: BRAND_SUPER-DUPER PUNCHED
## 38: BRAND_MOONLIT
## 39: ITEM_SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA
## 40: 20SMALL MULTI JUG
## 41: BRAND_SUPER-DUPER JUICED
## Feature
## Gain Cover Frequency
## <num> <num> <num>
## 1: 2.743479e-01 0.0311422969 0.0603201830
## 2: 2.300257e-01 0.4877309541 0.3480560320
## 3: 1.406387e-01 0.0139118842 0.0313036021
## 4: 6.084740e-02 0.1241158505 0.1262149800
## 5: 3.380574e-02 0.0132583230 0.0125786164
## 6: 3.060451e-02 0.0154702145 0.0224413951
## 7: 2.879702e-02 0.0149826183 0.0328759291
## 8: 2.873261e-02 0.0186260407 0.0208690680
## 9: 2.049425e-02 0.0162880098 0.0305889079
## 10: 1.891476e-02 0.0056386017 0.0210120069
## 11: 1.707243e-02 0.0292835808 0.0267295597
## 12: 1.353421e-02 0.0112822221 0.0168667810
## 13: 1.157059e-02 0.0009471601 0.0020011435
## 14: 1.125267e-02 0.0084123235 0.0154373928
## 15: 1.047061e-02 0.0054694362 0.0081475129
## 16: 1.006236e-02 0.0097555855 0.0124356775
## 17: 7.249893e-03 0.0043539325 0.0088622070
## 18: 6.108128e-03 0.0144459561 0.0180102916
## 19: 5.298023e-03 0.0365974367 0.0530303030
## 20: 5.183412e-03 0.0030446432 0.0081475129
## 21: 3.324229e-03 0.0029035100 0.0045740423
## 22: 3.296834e-03 0.0035136964 0.0107204117
## 23: 2.983453e-03 0.0135743502 0.0110062893
## 24: 2.786177e-03 0.0046931602 0.0092910234
## 25: 2.774711e-03 0.0040798356 0.0084333905
## 26: 2.728908e-03 0.0100883374 0.0114351058
## 27: 2.600210e-03 0.0010330629 0.0050028588
## 28: 1.952294e-03 0.0344090695 0.0105774728
## 29: 1.856985e-03 0.0188922547 0.0131503716
## 30: 1.778773e-03 0.0118920100 0.0071469411
## 31: 1.714332e-03 0.0006678825 0.0011435106
## 32: 1.459442e-03 0.0063401330 0.0042881647
## 33: 1.319734e-03 0.0049322759 0.0074328188
## 34: 1.309078e-03 0.0008901489 0.0025728988
## 35: 8.565183e-04 0.0073977413 0.0052887364
## 36: 7.154599e-04 0.0011473592 0.0017152659
## 37: 4.878142e-04 0.0039123637 0.0030017153
## 38: 4.269584e-04 0.0021826761 0.0018582047
## 39: 3.633528e-04 0.0015089903 0.0021440823
## 40: 1.625356e-04 0.0006477580 0.0024299600
## 41: 9.124767e-05 0.0005363135 0.0008576329
## Gain Cover Frequency
xgb.plot.importance(importance_matrix = importance_matrix2)
if (!requireNamespace("pdp", quietly = TRUE)) install.packages("pdp")
if (!requireNamespace("xgboost", quietly = TRUE)) install.packages("xgboost")
library(pdp)
library(xgboost)
pdp::partial(model_xgb_no_dollar_sales, pred.var = "WEEK_OF_YEAR", train = train_features)
## WEEK_OF_YEAR yhat
## 1 1.00 72.15863
## 2 2.04 72.16389
## 3 3.08 70.88787
## 4 4.12 70.90670
## 5 5.16 71.13975
## 6 6.20 71.17511
## 7 7.24 71.18012
## 8 8.28 70.85040
## 9 9.32 70.81431
## 10 10.36 70.95667
## 11 11.40 71.07068
## 12 12.44 71.07678
## 13 13.48 71.11003
## 14 14.52 71.31220
## 15 15.56 71.27513
## 16 16.60 71.61472
## 17 17.64 71.48038
## 18 18.68 71.62180
## 19 19.72 71.82469
## 20 20.76 71.93246
## 21 21.80 71.90557
## 22 22.84 71.77916
## 23 23.88 71.66963
## 24 24.92 71.84245
## 25 25.96 71.74708
## 26 27.00 71.67656
## 27 28.04 71.75623
## 28 29.08 71.58050
## 29 30.12 70.44575
## 30 31.16 70.44596
## 31 32.20 70.84234
## 32 33.24 70.97563
## 33 34.28 70.45476
## 34 35.32 70.46765
## 35 36.36 70.72899
## 36 37.40 71.29672
## 37 38.44 71.25609
## 38 39.48 71.09182
## 39 40.52 70.83648
## 40 41.56 70.75148
## 41 42.60 71.28568
## 42 43.64 71.26201
## 43 44.68 71.09134
## 44 45.72 71.08036
## 45 46.76 71.00827
## 46 47.80 71.92899
## 47 48.84 73.60783
## 48 49.88 74.22794
## 49 50.92 74.69990
## 50 51.96 75.44216
## 51 53.00 74.99059
pd <- partial(model_xgb_no_dollar_sales, pred.var = "WEEK_OF_YEAR", train = train_features, grid.resolution = 20)
# Default PDP
pdp1 <- plotPartial(pd, plot = TRUE)
# plot
grid.arrange(pdp1)
Based on the Casava Energy Drink innovation datafram we expect the best 6 months to be between about weeks 20 and weeks 41. Form looking at Plum, we know that the dummy data predictions do not work well with the data and XGBoost.
#cleanup all obj
rm(list = ls())
Stochastic Weighted Average model for Casava Energy Drink, this dumps out the predicted demand. We then used Excel to calculate the optimal order quantity based on the predicted demand and the cost of overstocking and understocking using a simple NewsVendor Model. All visualtions were done in Tabelau.
df <- read_csv("casava_tableau.csv") #inject the df and we will sub-sample
## Rows: 72455 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): CATEGORY, MANUFACTURER, BRAND, PACKAGE, ITEM, REGION, SEASON, PACK...
## dbl (7): MARKET_KEY, CALORIC_SEGMENT, UNIT_SALES, DOLLAR_SALES, POP_SQMI, M...
## date (2): DATE, min_launch_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 17
## MARKET_KEY DATE CALORIC_SEGMENT CATEGORY UNIT_SALES DOLLAR_SALES
## <dbl> <date> <dbl> <chr> <dbl> <dbl>
## 1 1 2022-09-17 1 ENERGY 21 38.5
## 2 1 2021-09-18 1 ENERGY 27 45.0
## 3 1 2022-11-05 1 ENERGY 33 91.6
## 4 1 2023-04-29 1 ENERGY 54 97.0
## 5 1 2023-04-01 1 ENERGY 23 56.8
## 6 1 2023-04-29 1 ENERGY 24 53.5
## # ℹ 11 more variables: MANUFACTURER <chr>, BRAND <chr>, PACKAGE <chr>,
## # ITEM <chr>, POP_SQMI <dbl>, REGION <chr>, MONTH <dbl>, SEASON <chr>,
## # PACKAGE2 <chr>, min_launch_date <date>, WEEKS_SINCE_LAUNCH <dbl>
#filter package by "16SMALL MULTI CUP" and filter out non super-duper
df <- df %>%
filter(PACKAGE == "16SMALL MULTI CUP",
# str_detect(ITEM,"SUPER-DUPER|MYTHICAL"))
str_detect(ITEM,"SUPER-DUPER"))
print(unique(df$ITEM))
## [1] "SUPER-DUPER JUICED ENERGY DRINK CASAVA SUNSET GUAVA"
## [2] "SUPER-DUPER RECOVERY ENERGY DRINK CASAVA JACK"
## [3] "SUPER-DUPER PITAYA ED ENERGY DRINK CASAVA NO ARTIFICIAL SWEETENERS"
#SUM of Sales of "SUPER-DUPER JUICED CASAVA SUNSET GUAVA", "SUPER-DUPER PITAYA CASAVA", "SUPER-DUPER RECOVERY CASAVA JACK" by WEEKS_SINCE_LAUNCH keep DATE column. Average super duper sales and take ratio of Diet Moonlit to Regular
df <- df %>%
group_by(WEEKS_SINCE_LAUNCH) %>%
summarise(INNOVATION_UNIT_SALES = (sum(UNIT_SALES)) * 0.5/3) %>%
ungroup() # This ungroups the df after summarizing, making it easier to work with
# Now you can select all three columns: WEEKS_SINCE_LAUNCH, SUM_UNIT_SALES, and DATE
df <- df %>%
select(WEEKS_SINCE_LAUNCH, INNOVATION_UNIT_SALES)
# #Convert WEEKS_SINCE_LAUNCH to a DATE starting at week 20 of 2021
# df$DATE <- as.Date("2021-05-17") + (df$WEEKS_SINCE_LAUNCH - 1) * 7
# df <- df %>%
# select(DATE, INNOVATION_UNIT_SALES)
#
# head(df)
df
## # A tibble: 109 × 2
## WEEKS_SINCE_LAUNCH INNOVATION_UNIT_SALES
## <dbl> <dbl>
## 1 0 10.7
## 2 1 151.
## 3 2 760.
## 4 3 1670.
## 5 4 2689.
## 6 5 3553
## 7 6 4628.
## 8 7 5804.
## 9 8 5802.
## 10 9 4768.
## # ℹ 99 more rows
write_csv(df,"cassava_newsvendor.csv")